From 838d0b195c2d2c055736c0b2804cfe44e064384f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 13:44:52 +0200 Subject: [PATCH 01/96] [susy2] add susy_gg_tt.sa to the repository to allow code fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default HRDCOD=0 build presently fails ccache g++ -O3 -std=c++17 -I. -fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:8: Parameters_MSSM_SLHA2.h:19:2: error: #error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" 19 | #error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" | ^~~~~ In file included from Parameters_MSSM_SLHA2.cc:8: Parameters_MSSM_SLHA2.h: In function ‘const Parameters_MSSM_SLHA2_dependentCouplings::DependentCouplings_sv Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG(const fptype_sv&)’: Parameters_MSSM_SLHA2.h:806:56: error: conversion from ‘fptype_sv’ {aka ‘__vector(4) double’} to non-scalar type ‘const mgOnGpu::cxsmpl’ requested 806 | constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); | ~~~~~~~~^~~~~~~~~ Parameters_MSSM_SLHA2.h:809:31: error: ‘mdl_I51x11’ was not declared in this scope 809 | out.GC_51 = -( cI * G * mdl_I51x11 ); | ^~~~~~~~~~ Parameters_MSSM_SLHA2.cc: In member function ‘void Parameters_MSSM_SLHA2::setIndependentParameters(SLHAReader&)’: Parameters_MSSM_SLHA2.cc:67:3: error: ‘indices’ was not declared in this scope 67 | indices[0] = 3; | ^~~~~~~ make[1]: *** [cudacpp_src.mk:236: Parameters_MSSM_SLHA2.o] Error 1 The non-default HRDCOD=1 however also fails, the first error being ccache g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:8: Parameters_MSSM_SLHA2.h:380:51: error: call to non-‘constexpr’ function ‘mgOnGpu::cxsmpl mgOnGpu::conj(const mgOnGpu::cxsmpl&) [with FP = double]’ 380 | constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); | ~~~~^~~~~~~~~~~~~ In file included from Parameters_MSSM_SLHA2.h:13, from Parameters_MSSM_SLHA2.cc:8: --- epochX/cudacpp/susy_gg_tt.sa/.clang-format | 226 ++ .../cudacpp/susy_gg_tt.sa/CMake/Compilers.txt | 2 + epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt | 10 + .../cudacpp/susy_gg_tt.sa/CMake/Platforms.txt | 3 + epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt | 14 + .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 607 +++++ .../susy_gg_tt.sa/Cards/param_card.dat | 492 ++++ .../susy_gg_tt.sa/SubProcesses/Bridge.h | 519 +++++ .../SubProcesses/BridgeKernels.cc | 149 ++ .../SubProcesses/BridgeKernels.h | 134 ++ .../susy_gg_tt.sa/SubProcesses/CMakeLists.txt | 4 + .../SubProcesses/CrossSectionKernels.cc | 231 ++ .../SubProcesses/CrossSectionKernels.h | 133 ++ .../susy_gg_tt.sa/SubProcesses/CudaRuntime.h | 80 + .../SubProcesses/EventStatistics.h | 160 ++ .../susy_gg_tt.sa/SubProcesses/MadgraphTest.h | 300 +++ .../SubProcesses/MatrixElementKernels.cc | 237 ++ .../SubProcesses/MatrixElementKernels.h | 183 ++ .../SubProcesses/MemoryAccessAmplitudes.h | 150 ++ .../SubProcesses/MemoryAccessCouplings.h | 256 +++ .../SubProcesses/MemoryAccessCouplingsFixed.h | 70 + .../SubProcesses/MemoryAccessDenominators.h | 18 + .../SubProcesses/MemoryAccessGs.h | 148 ++ .../SubProcesses/MemoryAccessHelpers.h | 152 ++ .../SubProcesses/MemoryAccessMatrixElements.h | 132 ++ .../SubProcesses/MemoryAccessMomenta.h | 260 +++ .../SubProcesses/MemoryAccessNumerators.h | 18 + .../SubProcesses/MemoryAccessRandomNumbers.h | 132 ++ 
.../SubProcesses/MemoryAccessVectors.h | 122 + .../SubProcesses/MemoryAccessWavefunctions.h | 155 ++ .../SubProcesses/MemoryAccessWeights.h | 135 ++ .../SubProcesses/MemoryBuffers.h | 530 +++++ .../P1_Sigma_MSSM_SLHA2_gg_ttx/Bridge.h | 1 + .../BridgeKernels.cc | 1 + .../BridgeKernels.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt | 24 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 1084 +++++++++ .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h | 166 ++ .../CrossSectionKernels.cc | 1 + .../CrossSectionKernels.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h | 1 + .../EventStatistics.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/MadgraphTest.h | 1 + .../MatrixElementKernels.cc | 1 + .../MatrixElementKernels.h | 1 + .../MemoryAccessAmplitudes.h | 1 + .../MemoryAccessCouplings.h | 1 + .../MemoryAccessCouplingsFixed.h | 1 + .../MemoryAccessDenominators.h | 1 + .../MemoryAccessGs.h | 1 + .../MemoryAccessHelpers.h | 1 + .../MemoryAccessMatrixElements.h | 1 + .../MemoryAccessMomenta.h | 1 + .../MemoryAccessNumerators.h | 1 + .../MemoryAccessRandomNumbers.h | 1 + .../MemoryAccessVectors.h | 1 + .../MemoryAccessWavefunctions.h | 1 + .../MemoryAccessWeights.h | 1 + .../MemoryBuffers.h | 1 + .../RamboSamplingKernels.cc | 1 + .../RamboSamplingKernels.h | 1 + .../RandomNumberKernels.cc | 1 + .../RandomNumberKernels.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc | 1120 +++++++++ .../P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp.mk | 1 + .../epoch_process_id.h | 11 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.inc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f | 84 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.inc | 1 + .../gBridgeKernels.cu | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu | 1 + .../gCrossSectionKernels.cu | 1 + .../gMatrixElementKernels.cu | 1 + .../gRamboSamplingKernels.cu | 1 + .../gRandomNumberKernels.cu | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu | 1 + 
.../P1_Sigma_MSSM_SLHA2_gg_ttx/makefile | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/nvtx.h | 1 + .../ompnumthreads.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/perf.py | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/profile.sh | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/runTest.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/testmisc.cc | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx.cc | 1 + .../testxxx_cc_ref.txt | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/timer.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/timermap.h | 1 + .../SubProcesses/RamboSamplingKernels.cc | 178 ++ .../SubProcesses/RamboSamplingKernels.h | 129 ++ .../SubProcesses/RandomNumberKernels.cc | 149 ++ .../SubProcesses/RandomNumberKernels.h | 146 ++ .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 798 +++++++ .../susy_gg_tt.sa/SubProcesses/fbridge.cc | 126 + .../susy_gg_tt.sa/SubProcesses/fbridge.inc | 66 + .../susy_gg_tt.sa/SubProcesses/fsampler.cc | 159 ++ .../susy_gg_tt.sa/SubProcesses/fsampler.inc | 37 + .../cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h | 69 + .../SubProcesses/ompnumthreads.h | 58 + .../susy_gg_tt.sa/SubProcesses/perf.py | 346 +++ .../susy_gg_tt.sa/SubProcesses/profile.sh | 182 ++ .../susy_gg_tt.sa/SubProcesses/runTest.cc | 251 ++ .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 217 ++ .../susy_gg_tt.sa/SubProcesses/testxxx.cc | 323 +++ .../SubProcesses/testxxx_cc_ref.txt | 2044 +++++++++++++++++ .../susy_gg_tt.sa/SubProcesses/timer.h | 67 + .../susy_gg_tt.sa/SubProcesses/timermap.h | 156 ++ .../cudacpp/susy_gg_tt.sa/src/CMakeLists.txt | 5 + .../susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h | 963 ++++++++ .../src/Parameters_MSSM_SLHA2.cc | 1480 ++++++++++++ .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 890 +++++++ .../cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk | 268 +++ .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h | 234 ++ .../susy_gg_tt.sa/src/mgOnGpuCxtypes.h | 633 +++++ .../susy_gg_tt.sa/src/mgOnGpuFptypes.h | 87 + .../susy_gg_tt.sa/src/mgOnGpuVectors.h | 829 +++++++ epochX/cudacpp/susy_gg_tt.sa/src/rambo.h | 180 ++ 
epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc | 184 ++ epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h | 41 + 120 files changed, 19597 insertions(+) create mode 100644 epochX/cudacpp/susy_gg_tt.sa/.clang-format create mode 100644 epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/Cards/param_card.dat create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h create mode 100644 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/Bridge.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/EventStatistics.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MadgraphTest.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.cc 
create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessAmplitudes.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplings.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplingsFixed.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessDenominators.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessGs.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessHelpers.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMatrixElements.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMomenta.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessNumerators.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessRandomNumbers.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessVectors.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWavefunctions.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWeights.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryBuffers.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.h create mode 120000 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp.mk create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.inc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.inc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/nvtx.h create mode 120000 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/ompnumthreads.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/perf.py create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/profile.sh create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/runTest.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testmisc.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx_cc_ref.txt create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timer.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timermap.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py create mode 100755 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc create mode 100644 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/rambo.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h diff --git a/epochX/cudacpp/susy_gg_tt.sa/.clang-format b/epochX/cudacpp/susy_gg_tt.sa/.clang-format new file mode 100644 index 0000000000..12afd69b12 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/.clang-format @@ -0,0 +1,226 @@ +# AV's draft .clang-format +# --- +# February 2022: latest draft for clang 13.0.0 (BasedOnStyle: Google) +# See https://releases.llvm.org/13.0.0/tools/clang/docs/ClangFormatStyleOptions.html +--- +Language: Cpp +BasedOnStyle: Google + +AccessModifierOffset: -2 # AV was -1 +AlignAfterOpenBracket: Align # AV ok +AlignArrayOfStructures: None # AV ok (alternative: Right, but code-generating it would be too complex) +AlignConsecutiveAssignments: None # AV ok +AlignConsecutiveBitFields: None # AV ok +AlignConsecutiveDeclarations: None # AV ok +AlignConsecutiveMacros: None # AV ok 
+AlignEscapedNewlines: DontAlign # AV was Left +AlignOperands: DontAlign # AV was Align +AlignTrailingComments: true # AV ok +AllowAllArgumentsOnNextLine: true # AV ok(?) +AllowAllConstructorInitializersOnNextLine: true # AV ok (NB: relevant only if ConstructorInitializerAllOnOneLineOrOnePerLine=true) +AllowAllParametersOfDeclarationOnNextLine: true # AV ok(?) +AllowShortBlocksOnASingleLine: Always # AV was Never +AllowShortEnumsOnASingleLine: true # AV ok +AllowShortCaseLabelsOnASingleLine: true # AV was false +AllowShortFunctionsOnASingleLine: All # AV ok +AllowShortLambdasOnASingleLine: All # AV ok +AllowShortIfStatementsOnASingleLine: WithoutElse # AV ok +AllowShortLoopsOnASingleLine: true # AV ok +###AlwaysBreakAfterDefinitionReturnType: None # AV keep defaults (deprecated) +#AlwaysBreakAfterReturnType: All # AV use this initially, then switch to TopLevelDefinitions! +AlwaysBreakAfterReturnType: TopLevelDefinitions # AV was None (altearnative: All?) +AlwaysBreakBeforeMultilineStrings: false # AV was true +AlwaysBreakTemplateDeclarations: Yes # AV ok +###AttributeMacros: # AV keep defaults (NB this is not about '__host__' attributes, see llvm/llvm-project/issues/45968) +### - __capability +BinPackArguments: false # AV was true +BinPackParameters: false # AV was true +BitFieldColonSpacing: Both # AV ok +BraceWrapping: # (NB: this is only relevant for "BreakBeforeBraces: Custom") + AfterCaseLabel: true # AV was false + AfterClass: true # AV was false + AfterControlStatement: Always # AV was Never + AfterEnum: true # AV was false + AfterFunction: true # AV was false + AfterNamespace: true # AV was false + AfterObjCDeclaration: true # AV was false + AfterStruct: true # AV was false + AfterUnion: true # AV was false + AfterExternBlock: true # AV was false (NB: does not work unless IndentExternBlock is AfterExternBlock?!) 
+ BeforeCatch: true # AV was false + BeforeElse: true # AV was false + BeforeLambdaBody: true # AV was false + BeforeWhile: true # AV was false + IndentBraces: false # AV ok + SplitEmptyFunction: true # AV ok + SplitEmptyRecord: true # AV ok + SplitEmptyNamespace: true # AV ok +BreakAfterJavaFieldAnnotations: false +BreakBeforeBinaryOperators: None # AV ok +BreakBeforeBraces: Custom # AV was Attach (alternative: Allman) +BreakBeforeConceptDeclarations: true # AV ok +###BreakBeforeInheritanceComma: false # (obsolete???) +BreakBeforeTernaryOperators: true # AV ok +###BreakConstructorInitializersBeforeComma: true # AV was false (obsolete???) +BreakConstructorInitializers: BeforeComma # AV was BeforeColon +BreakInheritanceList: BeforeColon # AV ok (alternative: BeforeComma?) +BreakStringLiterals: false # AV was true +ColumnLimit: 0 # AV was 80 +###CommentPragmas: '^[^ ]*' # AV use SpacesInLineCommentPrefix Min=0 Max=1 to allow both "//comment" and "// comment" +CompactNamespaces: false # AV ok +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 2 # AV was 4 +ContinuationIndentWidth: 2 # AV was 4 +Cpp11BracedListStyle: true # AV ok +DeriveLineEnding: false # AV was true +DerivePointerAlignment: false # AV was true +DisableFormat: false # AV ok +EmptyLineAfterAccessModifier: Leave # AV was Never +EmptyLineBeforeAccessModifier: Leave # AV was LogicalBlock +ExperimentalAutoDetectBinPacking: false # AV ok ("use at your own risk") +FixNamespaceComments: false # AV was true +###ForEachMacros: # AV keep defaults +### - foreach +### - Q_FOREACH +### - BOOST_FOREACH +###IfMacros: # AV keep defaults +### - KJ_IF_MAYBE +IncludeBlocks: Regroup # AV ok +IncludeCategories: + - Regex: '^' + Priority: 4 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*\.h>' + Priority: 5 # AV was 1 + SortPriority: 0 + CaseSensitive: false + - Regex: '^<.*' + Priority: 6 # AV was 2 + SortPriority: 0 + CaseSensitive: false + - Regex: 
'mgOnGpuConfig.h' + Priority: 1 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: 'mgOnGpu*.*' + Priority: 2 # AV new + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 3 # AV was 3 + SortPriority: 0 + CaseSensitive: false +###IncludeIsMainRegex: '([-_](test|unittest))?$' # AV keep defaults +###IncludeIsMainSourceRegex: '' # AV keep defaults +IndentAccessModifiers: false # AV ok +IndentCaseLabels: true # AV ok +IndentCaseBlocks: false # AV ok +IndentGotoLabels: false # AV was true +IndentPPDirectives: None # AV ok (NB: AfterHash and BeforeHash do not seem to work as intended) +###IndentExternBlock: Indent # AV was AfterExternBlock +IndentExternBlock: AfterExternBlock # AV ok (only with Custom BraceWrapping.AfterExternBlock = true) +IndentRequires: false # AV ok(?) +IndentWidth: 2 # AV ok +IndentWrappedFunctionNames: false # AV ok +###InsertTrailingCommas: None # AV keep defaults (Java only?) +###JavaScriptQuotes: Leave # AV irrelevant +###JavaScriptWrapImports: true # AV irrelevant +KeepEmptyLinesAtTheStartOfBlocks: false # AV ok +LambdaBodyIndentation: Signature # AV ok +###MacroBlockBegin: '' # AV keep defaults +###MacroBlockEnd: '' # AV keep defaults +MaxEmptyLinesToKeep: 1 # AV ok +NamespaceIndentation: All # AV was None +###ObjCBinPackProtocolList: Never # AV irrelevant +###ObjCBlockIndentWidth: 2 # AV irrelevant +###ObjCBreakBeforeNestedBlockParam: true # AV irrelevant +###ObjCSpaceAfterProperty: false # AV irrelevant +###ObjCSpaceBeforeProtocolList: true # AV irrelevant +###PenaltyBreakAssignment: 2 # AV keep defaults +###PenaltyBreakBeforeFirstCallParameter: 1 # AV keep defaults +###PenaltyBreakComment: 300 # AV keep defaults +###PenaltyBreakFirstLessLess: 120 # AV keep defaults +###PenaltyBreakString: 1000 # AV keep defaults +###PenaltyBreakTemplateDeclaration: 10 # AV keep defaults +###PenaltyExcessCharacter: 1000000 # AV keep defaults +###PenaltyReturnTypeOnItsOwnLine: 200 # AV keep defaults +###PenaltyIndentedWhitespace: 0 # 
AV keep defaults +PointerAlignment: Left # AV ok +PPIndentWidth: 0 # AV was -1 +###RawStringFormats: # AV keep defaults +### - Language: Cpp +### Delimiters: +### - cc +### - CC +### - cpp +### - Cpp +### - CPP +### - 'c++' +### - 'C++' +### CanonicalDelimiter: '' +### BasedOnStyle: google +### - Language: TextProto +### Delimiters: +### - pb +### - PB +### - proto +### - PROTO +### EnclosingFunctions: +### - EqualsProto +### - EquivToProto +### - PARSE_PARTIAL_TEXT_PROTO +### - PARSE_TEST_PROTO +### - PARSE_TEXT_PROTO +### - ParseTextOrDie +### - ParseTextProtoOrDie +### - ParseTestProto +### - ParsePartialTestProto +### CanonicalDelimiter: pb +### BasedOnStyle: google +ReferenceAlignment: Pointer # AV ok +ReflowComments: false # AV was true +ShortNamespaceLines: 1 # AV ok +SortIncludes: CaseSensitive # AV ok +###SortJavaStaticImport: Before # irrelevant +SortUsingDeclarations: false # AV was true +SpaceAfterCStyleCast: false # AV ok +SpaceAfterLogicalNot: false # AV ok +SpaceAfterTemplateKeyword: false # AV was true +SpaceAroundPointerQualifiers: Default # AV ok (alternative: Before?) +SpaceBeforeAssignmentOperators: true # AV ok +SpaceBeforeCaseColon: false # AV ok +SpaceBeforeCpp11BracedList: false # AV ok +SpaceBeforeCtorInitializerColon: true # AV ok +SpaceBeforeInheritanceColon: true # AV ok +SpaceBeforeParens: Never # AV was ControlStatements +SpaceBeforeRangeBasedForLoopColon: false # AV was true +SpaceBeforeSquareBrackets: false # AV ok +SpaceInEmptyBlock: false # AV ok +SpaceInEmptyParentheses: false # AV ok +SpacesBeforeTrailingComments: 1 # AV was 2 +SpacesInAngles: Never # AV ok +SpacesInConditionalStatement: false # AV ok (does this work?) 
+SpacesInContainerLiterals: false # AV was true +SpacesInCStyleCastParentheses: false # AV ok +SpacesInLineCommentPrefix: + Minimum: 0 # AV was 1 + Maximum: 1 # AV was -1 +SpacesInParentheses: true # AV was false +SpacesInSquareBrackets: false # AV ok +Standard: c++17 # AV was Auto +###StatementAttributeLikeMacros: # AV keep defaults +### - Q_EMIT +###StatementMacros: # AV keep defaults +### - Q_UNUSED +### - QT_REQUIRE_VERSION +###TabWidth: 8 # AV irrelevant if UseTab=Never? +UseCRLF: false # AV ok (but set DeriveLineEnding=false) +UseTab: Never # AV ok +###WhitespaceSensitiveMacros: # AV keep defaults +### - STRINGIZE +### - PP_STRINGIZE +### - BOOST_PP_STRINGIZE +### - NS_SWIFT_NAME +### - CF_SWIFT_NAME +... diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt new file mode 100644 index 0000000000..eec4baed28 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt @@ -0,0 +1,2 @@ +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED True) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt new file mode 100644 index 0000000000..9a0e141b81 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt @@ -0,0 +1,10 @@ +MACRO(SUBDIRLIST result) + FILE(GLOB children RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/*) + SET(dirlist "") + FOREACH(child ${children}) + IF(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${child}) + LIST(APPEND dirlist ${child}) + ENDIF() + ENDFOREACH() + SET(${result} ${dirlist}) +ENDMACRO() diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt new file mode 100644 index 0000000000..ab73e53db8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt @@ -0,0 +1,3 @@ +if (CMAKE_HOST_APPLE) + add_definitions(-DMGONGPU_HAS_NO_CURAND) +endif(CMAKE_HOST_APPLE) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt 
b/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt new file mode 100644 index 0000000000..d3010411fc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt @@ -0,0 +1,14 @@ +# Minimal CMake configuration to build a functional CPU version + +cmake_minimum_required(VERSION 3.22) + +project(Madgraph4GPU) + +include(${PROJECT_SOURCE_DIR}/CMake/Platforms.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Compilers.txt) +include(${PROJECT_SOURCE_DIR}/CMake/Macros.txt) + +set(PROJECT_GITROOT_DIR ${PROJECT_SOURCE_DIR}/../../..) + +add_subdirectory(src) +add_subdirectory(SubProcesses) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt new file mode 100644 index 0000000000..f4d9395bb9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -0,0 +1,607 @@ +Note that this is a development version. +This version is intended for development/beta testing and NOT for production. +This version has not been fully tested (if at all) and might have limited user support (if at all) +Running MG5 in debug mode +************************************************************ +* * +* W E L C O M E to * +* M A D G R A P H 5 _ a M C @ N L O * +* * +* * +* * * * +* * * * * * +* * * * * 5 * * * * * +* * * * * * +* * * * +* * +* VERSION 3.5.0_lo_vect 2023-01-26 * +* * +* WARNING: UNKNOWN DEVELOPMENT VERSION. * +* WARNING: DO NOT USE FOR PRODUCTION * +* * +* * +* The MadGraph5_aMC@NLO Development Team - Find us at * +* https://server06.fynu.ucl.ac.be/projects/madgraph * +* and * +* http://amcatnlo.web.cern.ch/amcatnlo/ * +* * +* Type 'help' for in-line help. 
* +* Type 'tutorial' to learn how MG5 works * +* Type 'tutorial aMCatNLO' to learn how aMC@NLO works * +* Type 'tutorial MadLoop' to learn how MadLoop works * +* * +************************************************************ +load MG5 configuration from input/mg5_configuration.txt +fastjet-config does not seem to correspond to a valid fastjet-config executable (v3+). We will use fjcore instead. + Please set the 'fastjet'variable to the full (absolute) /PATH/TO/fastjet-config (including fastjet-config). + MG5_aMC> set fastjet /PATH/TO/fastjet-config + +eMELA-config does not seem to correspond to a valid eMELA-config executable. + Please set the 'fastjet'variable to the full (absolute) /PATH/TO/eMELA-config (including eMELA-config). + MG5_aMC> set eMELA /PATH/TO/eMELA-config + +lhapdf-config does not seem to correspond to a valid lhapdf-config executable. +Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). +Note that you can still compile and run aMC@NLO with the built-in PDFs + MG5_aMC> set lhapdf /PATH/TO/lhapdf-config + +None does not seem to correspond to a valid lhapdf-config executable. +Please set the 'lhapdf' variable to the (absolute) /PATH/TO/lhapdf-config (including lhapdf-config). +Note that you can still compile and run aMC@NLO with the built-in PDFs + MG5_aMC> set lhapdf /PATH/TO/lhapdf-config + +Using default text editor "vi". Set another one in ./input/mg5_configuration.txt +No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt.mg +The import format was not given, so we guess it as command +set stdout_level DEBUG +set output information to level: 10 +set zerowidth_tchannel F +import model MSSM_SLHA2; generate g g > t t~ +INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . 
+INFO: Detect SLHA2 format. keeping restricted parameter in the param_card +DEBUG: Simplifying conditional expressions  +DEBUG: coupling with small value GC_854: (8.881784197001252e-16+0j) treated as zero  +DEBUG: coupling with small value GC_857: (-8.881784197001252e-16+0j) treated as zero  +DEBUG: coupling with small value GC_888: -8.881784197001252e-16j treated as zero  +DEBUG: coupling with small value GC_889: 2.220446049250313e-16j treated as zero  +DEBUG: remove interactions: a0 sd3 sd3 at order: QED=1  +DEBUG: remove interactions: a0 sd6 sd6 at order: QED=1  +DEBUG: remove interactions: a0 sl3- sl3- at order: QED=1  +DEBUG: remove interactions: a0 sl6- sl6- at order: QED=1  +DEBUG: remove interactions: a w+ h+ at order: QED=1  +DEBUG: remove interactions: a w+ h+ at order: QED=1  +DEBUG: remove interactions: w+ z h+ at order: QED=1  +DEBUG: remove interactions: w+ z h+ at order: QED=1  +DEBUG: remove interactions: a0 su3 su3 at order: QED=1  +DEBUG: remove interactions: a0 su6 su6 at order: QED=1  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_106', 1), ('GC_107', 1), ('GC_110', -1), ('GC_111', -1), ('GC_114', 1), ('GC_115', 1), ('GC_118', -1), ('GC_119', -1), ('GC_498', 1), ('GC_503', 1), ('GC_518', -1), ('GC_523', -1), ('GC_582', 1), ('GC_587', 1), ('GC_602', -1), ('GC_607', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_123', 1), ('GC_125', 1), ('GC_126', 1), ('GC_131', 1), ('GC_132', 1), ('GC_137', 1), ('GC_138', 1), ('GC_836', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_128', 1), ('GC_129', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_10', 1), ('GC_13', 1), ('GC_22', 1), ('GC_25', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_134', 1), ('GC_135', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_11', 1), ('GC_14', 1), ('GC_23', 1), ('GC_26', 1)  +DEBUG: Fuse the Following coupling (they have the same 
value): ('GC_140', 1), ('GC_141', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_143', 1), ('GC_144', 1), ('GC_145', 1), ('GC_146', 1), ('GC_181', -1), ('GC_182', -1), ('GC_187', -1), ('GC_188', -1), ('GC_191', 1), ('GC_192', 1), ('GC_197', 1), ('GC_198', 1), ('GC_403', 1), ('GC_404', 1), ('GC_405', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_12', 1), ('GC_15', 1), ('GC_24', 1), ('GC_27', 1), ('GC_84', 1), ('GC_87', 1), ('GC_9', 1), ('GC_96', 1), ('GC_99', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_149', 1), ('GC_150', 1), ('GC_153', 1), ('GC_154', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_151', 1), ('GC_155', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_152', 1), ('GC_156', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_157', 1), ('GC_159', 1), ('GC_169', 1), ('GC_171', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_158', 1), ('GC_160', 1), ('GC_170', 1), ('GC_172', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_161', 1), ('GC_173', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_162', 1), ('GC_174', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_163', 1), ('GC_175', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_164', 1), ('GC_176', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_165', 1), ('GC_177', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_166', 1), ('GC_178', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_167', 1), ('GC_179', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_168', 1), ('GC_180', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_183', 1), ('GC_193', -1)  +DEBUG: Fuse the Following coupling (they have the same value): 
('GC_184', 1), ('GC_194', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_185', 1), ('GC_195', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_186', 1), ('GC_196', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_189', 1), ('GC_199', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_190', 1), ('GC_200', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_201', 1), ('GC_202', 1), ('GC_211', 1), ('GC_212', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_207', 1), ('GC_208', 1), ('GC_217', 1), ('GC_218', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_203', 1), ('GC_213', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_204', 1), ('GC_214', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_205', 1), ('GC_215', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_206', 1), ('GC_216', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_209', 1), ('GC_219', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_210', 1), ('GC_220', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_221', 1), ('GC_225', 1), ('GC_519', 1), ('GC_524', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_222', 1), ('GC_226', 1), ('GC_520', 1), ('GC_525', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_223', 1), ('GC_227', 1), ('GC_521', 1), ('GC_526', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_224', 1), ('GC_228', 1), ('GC_522', 1), ('GC_527', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_229', 1), ('GC_233', 1), ('GC_554', 1), ('GC_558', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_230', 1), ('GC_234', 1), ('GC_555', 1), ('GC_559', 1)  +DEBUG: Fuse the Following 
coupling (they have the same value): ('GC_231', 1), ('GC_235', 1), ('GC_556', 1), ('GC_560', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_232', 1), ('GC_236', 1), ('GC_557', 1), ('GC_561', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_237', 1), ('GC_241', 1), ('GC_603', 1), ('GC_608', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_238', 1), ('GC_242', 1), ('GC_604', 1), ('GC_609', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_239', 1), ('GC_243', 1), ('GC_605', 1), ('GC_610', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_240', 1), ('GC_244', 1), ('GC_606', 1), ('GC_611', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_246', 1), ('GC_247', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_248', 1), ('GC_251', -1), ('GC_252', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_250', 1), ('GC_253', -1), ('GC_254', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_249', 1), ('GC_255', -1), ('GC_256', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_257', 1), ('GC_258', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_259', 1), ('GC_261', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_260', 1), ('GC_262', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_263', 1), ('GC_265', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_264', 1), ('GC_266', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_267', 1), ('GC_268', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_269', 1), ('GC_270', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_271', 1), ('GC_272', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_273', 1), ('GC_274', 1)  +DEBUG: 
Fuse the Following coupling (they have the same value): ('GC_276', 1), ('GC_277', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_19', 1), ('GC_28', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_279', 1), ('GC_281', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_280', 1), ('GC_282', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_285', 1), ('GC_287', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_286', 1), ('GC_288', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_20', 1), ('GC_29', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_291', 1), ('GC_293', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_292', 1), ('GC_294', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_297', 1), ('GC_299', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_21', 1), ('GC_30', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_298', 1), ('GC_300', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_303', 1), ('GC_304', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_306', 1), ('GC_307', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_309', 1), ('GC_310', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_312', 1), ('GC_313', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_315', 1), ('GC_316', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_318', 1), ('GC_319', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_321', 1), ('GC_322', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_324', 1), ('GC_325', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_327', 1), ('GC_328', 1), ('GC_947', 1)  +DEBUG: Fuse the 
Following coupling (they have the same value): ('GC_330', 1), ('GC_331', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_333', 1), ('GC_334', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_336', 1), ('GC_337', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_3', 1), ('GC_34', -1), ('GC_35', -1), ('GC_38', -1), ('GC_39', -1), ('GC_4', -1), ('GC_936', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_363', 1), ('GC_364', 1), ('GC_365', 1), ('GC_366', 1), ('GC_622', 1), ('GC_623', 1), ('GC_624', 1), ('GC_625', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_374', 1), ('GC_375', 1), ('GC_376', 1), ('GC_377', 1), ('GC_641', 1), ('GC_642', 1), ('GC_643', 1), ('GC_644', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_385', 1), ('GC_386', 1), ('GC_387', 1), ('GC_388', 1), ('GC_389', 1), ('GC_660', 1), ('GC_661', 1), ('GC_662', 1), ('GC_663', 1), ('GC_664', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_394', 1), ('GC_395', 1), ('GC_396', 1), ('GC_397', 1), ('GC_398', 1), ('GC_677', 1), ('GC_678', 1), ('GC_679', 1), ('GC_680', 1), ('GC_681', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_37', 1), ('GC_40', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_406', 1), ('GC_407', 1), ('GC_499', 1), ('GC_504', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_408', 1), ('GC_409', 1), ('GC_538', 1), ('GC_542', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_410', 1), ('GC_411', 1), ('GC_412', 1), ('GC_570', 1), ('GC_574', 1), ('GC_578', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_413', 1), ('GC_414', 1), ('GC_583', 1), ('GC_588', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_42', 1), ('GC_43', 1), ('GC_46', 1), ('GC_47', 1)  +DEBUG: Fuse the Following 
coupling (they have the same value): ('GC_429', 1), ('GC_430', 1), ('GC_500', 1), ('GC_505', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_431', 1), ('GC_432', 1), ('GC_539', 1), ('GC_543', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_433', 1), ('GC_434', 1), ('GC_435', 1), ('GC_571', 1), ('GC_575', 1), ('GC_579', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_436', 1), ('GC_437', 1), ('GC_584', 1), ('GC_589', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_422', 1), ('GC_444', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_452', 1), ('GC_453', 1), ('GC_501', 1), ('GC_506', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_454', 1), ('GC_455', 1), ('GC_540', 1), ('GC_544', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_456', 1), ('GC_457', 1), ('GC_458', 1), ('GC_572', 1), ('GC_576', 1), ('GC_580', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_459', 1), ('GC_460', 1), ('GC_585', 1), ('GC_590', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_423', 1), ('GC_467', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_475', 1), ('GC_476', 1), ('GC_502', 1), ('GC_507', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_477', 1), ('GC_478', 1), ('GC_541', 1), ('GC_545', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_45', 1), ('GC_48', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_479', 1), ('GC_480', 1), ('GC_481', 1), ('GC_573', 1), ('GC_577', 1), ('GC_581', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_482', 1), ('GC_483', 1), ('GC_586', 1), ('GC_591', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_447', 1), ('GC_491', -1)  +DEBUG: Fuse the Following coupling (they have the same value): 
('GC_470', 1), ('GC_492', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_2', 1), ('GC_50', -1), ('GC_52', -1), ('GC_58', -1), ('GC_60', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_108', 1), ('GC_508', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_415', 1), ('GC_509', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_484', 1), ('GC_512', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_109', 1), ('GC_513', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_339', 1), ('GC_514', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_345', 1), ('GC_515', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_351', 1), ('GC_516', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_357', 1), ('GC_517', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_112', 1), ('GC_528', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_416', 1), ('GC_529', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_51', 1), ('GC_53', 1), ('GC_59', 1), ('GC_61', 1), ('GC_67', 1), ('GC_69', 1), ('GC_7', -1), ('GC_75', 1), ('GC_77', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_462', 1), ('GC_531', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_485', 1), ('GC_532', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_113', 1), ('GC_533', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_340', 1), ('GC_534', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_346', 1), ('GC_535', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_352', 1), ('GC_536', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_358', 1), ('GC_537', 1)  +DEBUG: Fuse the Following coupling (they 
have the same value): ('GC_417', 1), ('GC_546', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_440', 1), ('GC_547', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_463', 1), ('GC_548', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_341', 1), ('GC_550', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_353', 1), ('GC_552', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_359', 1), ('GC_553', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_418', 1), ('GC_562', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_464', 1), ('GC_564', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_487', 1), ('GC_565', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_342', 1), ('GC_566', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_348', 1), ('GC_567', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_354', 1), ('GC_568', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_360', 1), ('GC_569', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_116', 1), ('GC_592', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_425', 1), ('GC_593', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_448', 1), ('GC_594', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_471', 1), ('GC_595', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_494', 1), ('GC_596', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_117', 1), ('GC_597', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_343', 1), ('GC_598', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_349', 1), ('GC_599', 1)  +DEBUG: Fuse the Following coupling (they have the same value): 
('GC_355', 1), ('GC_600', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_361', 1), ('GC_601', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_120', 1), ('GC_612', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_449', 1), ('GC_614', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_472', 1), ('GC_615', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_495', 1), ('GC_616', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_121', 1), ('GC_617', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_344', 1), ('GC_618', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_56', 1), ('GC_62', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_356', 1), ('GC_620', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_362', 1), ('GC_621', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_367', 1), ('GC_626', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_368', 1), ('GC_627', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_369', 1), ('GC_628', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_372', 1), ('GC_629', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_57', 1), ('GC_63', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_373', 1), ('GC_630', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_370', 1), ('GC_631', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_371', 1), ('GC_632', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_419', 1), ('GC_633', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_442', 1), ('GC_634', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_465', 1), ('GC_635', 
1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_378', 1), ('GC_645', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_379', 1), ('GC_646', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_380', 1), ('GC_647', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_383', 1), ('GC_648', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_384', 1), ('GC_649', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_381', 1), ('GC_650', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_382', 1), ('GC_651', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_420', 1), ('GC_652', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_443', 1), ('GC_653', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_466', 1), ('GC_654', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_489', 1), ('GC_655', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_640', 1), ('GC_657', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_1', 1), ('GC_66', -1), ('GC_68', -1), ('GC_74', -1), ('GC_76', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_390', 1), ('GC_665', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_391', 1), ('GC_666', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_392', 1), ('GC_667', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_393', 1), ('GC_668', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_427', 1), ('GC_669', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_450', 1), ('GC_670', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_473', 1), ('GC_671', 1)  +DEBUG: Fuse the Following coupling (they have the same value): 
('GC_496', 1), ('GC_672', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_399', 1), ('GC_682', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_400', 1), ('GC_683', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_401', 1), ('GC_684', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_402', 1), ('GC_685', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_428', 1), ('GC_686', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_497', 1), ('GC_689', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_694', 1), ('GC_697', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_695', 1), ('GC_698', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_696', 1), ('GC_699', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_700', 1), ('GC_701', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_702', 1), ('GC_703', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_704', 1), ('GC_712', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_705', 1), ('GC_713', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_707', 1), ('GC_715', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_708', 1), ('GC_716', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_709', 1), ('GC_717', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_711', 1), ('GC_719', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_720', 1), ('GC_723', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_721', 1), ('GC_724', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_722', 1), ('GC_725', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_747', 1), 
('GC_748', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_749', 1), ('GC_750', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_751', 1), ('GC_752', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_755', 1), ('GC_756', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_757', 1), ('GC_758', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_759', 1), ('GC_760', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_762', 1), ('GC_763', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_770', 1), ('GC_771', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_772', 1), ('GC_773', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_774', 1), ('GC_775', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_778', 1), ('GC_779', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_72', 1), ('GC_78', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_780', 1), ('GC_781', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_782', 1), ('GC_783', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_73', 1), ('GC_79', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_6', 1), ('GC_8', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_726', 1), ('GC_808', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_727', 1), ('GC_809', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_729', 1), ('GC_811', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_730', 1), ('GC_812', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_731', 1), ('GC_813', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_733', 1), ('GC_815', 1)  +DEBUG: Fuse 
the Following coupling (they have the same value): ('GC_735', 1), ('GC_817', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_737', 1), ('GC_819', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_738', 1), ('GC_820', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_739', 1), ('GC_821', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_740', 1), ('GC_822', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_741', 1), ('GC_823', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_742', 1), ('GC_824', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_743', 1), ('GC_825', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_745', 1), ('GC_827', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_788', 1), ('GC_828', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_789', 1), ('GC_829', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_790', 1), ('GC_830', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_791', 1), ('GC_831', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_792', 1), ('GC_832', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_793', 1), ('GC_833', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_794', 1), ('GC_834', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_795', 1), ('GC_835', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_842', 1), ('GC_844', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_843', 1), ('GC_845', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_846', 1), ('GC_848', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_847', 1), ('GC_849', -1)  +DEBUG: Fuse the Following coupling 
(they have the same value): ('GC_82', 1), ('GC_85', 1), ('GC_94', 1), ('GC_97', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_855', 1), ('GC_856', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_83', 1), ('GC_86', 1), ('GC_95', 1), ('GC_98', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_876', 1), ('GC_877', 1), ('GC_880', 1), ('GC_881', 1), ('GC_884', 1), ('GC_885', 1), ('GC_886', 1), ('GC_887', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_878', 1), ('GC_882', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_879', 1), ('GC_883', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_890', 1), ('GC_898', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_891', 1), ('GC_899', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_893', 1), ('GC_901', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_903', 1), ('GC_904', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_858', 1), ('GC_906', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_859', 1), ('GC_907', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_860', 1), ('GC_908', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_861', 1), ('GC_909', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_100', 1), ('GC_91', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_863', 1), ('GC_911', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_865', 1), ('GC_913', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_866', 1), ('GC_914', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_867', 1), ('GC_915', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_868', 1), ('GC_916', 1)  
+DEBUG: Fuse the Following coupling (they have the same value): ('GC_869', 1), ('GC_917', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_870', 1), ('GC_918', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_871', 1), ('GC_919', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_101', 1), ('GC_92', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_872', 1), ('GC_920', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_873', 1), ('GC_921', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_874', 1), ('GC_922', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_894', 1), ('GC_924', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_895', 1), ('GC_925', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_896', 1), ('GC_926', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_897', 1), ('GC_927', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_102', 1), ('GC_93', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_940', 1), ('GC_941', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_942', 1), ('GC_943', -1)  +DEBUG: fix parameter value: mdl_RRd1x1  +DEBUG: fix parameter value: mdl_RRd2x2  +DEBUG: fix parameter value: mdl_RRd4x4  +DEBUG: fix parameter value: mdl_RRd5x5  +DEBUG: fix parameter value: mdl_RRl1x1  +DEBUG: fix parameter value: mdl_RRl2x2  +DEBUG: fix parameter value: mdl_RRl4x4  +DEBUG: fix parameter value: mdl_RRl5x5  +DEBUG: fix parameter value: mdl_RRn1x1  +DEBUG: fix parameter value: mdl_RRn2x2  +DEBUG: fix parameter value: mdl_RRn3x3  +DEBUG: fix parameter value: mdl_RMNS1x1  +DEBUG: fix parameter value: mdl_RMNS2x2  +DEBUG: fix parameter value: mdl_RMNS3x3  +DEBUG: fix parameter value: mdl_RRu1x1  +DEBUG: fix parameter value: mdl_RRu2x2  +DEBUG: fix 
parameter value: mdl_RRu4x4  +DEBUG: fix parameter value: mdl_RRu5x5  +DEBUG: fix parameter value: mdl_RCKM1x1  +DEBUG: fix parameter value: mdl_RCKM2x2  +DEBUG: fix parameter value: mdl_RCKM3x3  +DEBUG: remove parameters: mdl_CKM1x1  +DEBUG: remove parameters: mdl_CKM2x2  +DEBUG: fix parameter value: mdl_CKM3x3  +DEBUG: fix parameter value: mdl_Rd1x1  +DEBUG: remove parameters: mdl_Rd2x2  +DEBUG: fix parameter value: mdl_Rd4x4  +DEBUG: remove parameters: mdl_Rd5x5  +DEBUG: fix parameter value: mdl_Rl1x1  +DEBUG: remove parameters: mdl_Rl2x2  +DEBUG: fix parameter value: mdl_Rl4x4  +DEBUG: remove parameters: mdl_Rl5x5  +DEBUG: fix parameter value: mdl_Rn1x1  +DEBUG: remove parameters: mdl_Rn2x2  +DEBUG: fix parameter value: mdl_Rn3x3  +DEBUG: fix parameter value: mdl_Ru1x1  +DEBUG: remove parameters: mdl_Ru2x2  +DEBUG: fix parameter value: mdl_Ru4x4  +DEBUG: remove parameters: mdl_Ru5x5  +DEBUG: fix parameter value: mdl_conjg__CKM3x3  +DEBUG: remove parameters: mdl_conjg__Rd4x4  +DEBUG: fix parameter value: mdl_I100x44  +DEBUG: remove parameters: mdl_conjg__Rd5x5  +DEBUG: remove parameters: mdl_I100x55  +DEBUG: remove parameters: mdl_conjg__Rl4x4  +DEBUG: fix parameter value: mdl_I101x44  +DEBUG: remove parameters: mdl_conjg__Rl5x5  +DEBUG: remove parameters: mdl_I101x55  +DEBUG: remove parameters: mdl_conjg__Ru4x4  +DEBUG: fix parameter value: mdl_I102x44  +DEBUG: remove parameters: mdl_conjg__Ru5x5  +DEBUG: remove parameters: mdl_I102x55  +DEBUG: remove parameters: mdl_conjg__Rd1x1  +DEBUG: fix parameter value: mdl_I12x11  +DEBUG: remove parameters: mdl_conjg__Rd2x2  +DEBUG: remove parameters: mdl_I12x22  +DEBUG: remove parameters: mdl_I13x44  +DEBUG: remove parameters: mdl_I13x55  +DEBUG: remove parameters: mdl_conjg__Rl1x1  +DEBUG: fix parameter value: mdl_I25x11  +DEBUG: remove parameters: mdl_conjg__Rl2x2  +DEBUG: remove parameters: mdl_I25x22  +DEBUG: fix parameter value: mdl_I26x44  +DEBUG: remove parameters: mdl_I26x55  +DEBUG: remove parameters: 
mdl_I29x11  +DEBUG: remove parameters: mdl_I29x22  +DEBUG: fix parameter value: mdl_I31x11  +DEBUG: remove parameters: mdl_I31x22  +DEBUG: remove parameters: mdl_I32x44  +DEBUG: remove parameters: mdl_I32x55  +DEBUG: remove parameters: mdl_conjg__Rn1x1  +DEBUG: fix parameter value: mdl_I39x11  +DEBUG: remove parameters: mdl_conjg__Rn2x2  +DEBUG: remove parameters: mdl_I39x22  +DEBUG: fix parameter value: mdl_conjg__Rn3x3  +DEBUG: remove parameters: mdl_I43x11  +DEBUG: remove parameters: mdl_I43x22  +DEBUG: remove parameters: mdl_I43x33  +DEBUG: remove parameters: mdl_I45x11  +DEBUG: remove parameters: mdl_I45x22  +DEBUG: fix parameter value: mdl_I5x11  +DEBUG: remove parameters: mdl_I5x22  +DEBUG: remove parameters: mdl_conjg__Ru1x1  +DEBUG: fix parameter value: mdl_I51x11  +DEBUG: remove parameters: mdl_conjg__Ru2x2  +DEBUG: remove parameters: mdl_I51x22  +DEBUG: fix parameter value: mdl_I52x44  +DEBUG: remove parameters: mdl_I52x55  +DEBUG: remove parameters: mdl_conjg__CKM1x1  +DEBUG: fix parameter value: mdl_I53x11  +DEBUG: remove parameters: mdl_conjg__CKM2x2  +DEBUG: remove parameters: mdl_I53x22  +DEBUG: fix parameter value: mdl_I6x44  +DEBUG: remove parameters: mdl_I6x55  +DEBUG: remove parameters: mdl_I63x11  +DEBUG: remove parameters: mdl_I63x22  +DEBUG: remove parameters: mdl_I66x11  +DEBUG: remove parameters: mdl_I66x22  +DEBUG: remove parameters: mdl_I7x11  +DEBUG: remove parameters: mdl_I7x22  +DEBUG: fix parameter value: mdl_I74x11  +DEBUG: remove parameters: mdl_I74x22  +DEBUG: remove parameters: mdl_I75x44  +DEBUG: remove parameters: mdl_I75x55  +DEBUG: fix parameter value: mdl_I82x11  +DEBUG: remove parameters: mdl_I82x22  +DEBUG: remove parameters: mdl_I85x11  +DEBUG: remove parameters: mdl_I85x22  +DEBUG: fix parameter value: mdl_I87x11  +DEBUG: remove parameters: mdl_I87x22  +DEBUG: remove parameters: mdl_I87x33  +DEBUG: remove parameters: mdl_I89x11  +DEBUG: remove parameters: mdl_I89x22  +DEBUG: fix parameter value: mdl_I92x11  +DEBUG: remove 
parameters: mdl_I92x22  +DEBUG: fix parameter value: mdl_I93x11  +DEBUG: remove parameters: mdl_I93x22  +DEBUG: remove parameters: mdl_I94x11  +DEBUG: remove parameters: mdl_I94x22  +DEBUG: remove parameters: mdl_I95x11  +DEBUG: remove parameters: mdl_I95x22  +DEBUG: fix parameter value: mdl_I96x11  +DEBUG: remove parameters: mdl_I96x22  +DEBUG: fix parameter value: mdl_I97x11  +DEBUG: remove parameters: mdl_I97x22  +DEBUG: fix parameter value: mdl_I98x11  +DEBUG: remove parameters: mdl_I98x22  +DEBUG: Parameters set to identical values: 1*RmD21x1, 1*RmD22x2  +DEBUG: Parameters set to identical values: 1*RmE21x1, 1*RmE22x2  +DEBUG: Parameters set to identical values: 1*RmL21x1, 1*RmL22x2  +DEBUG: Parameters set to identical values: 1*RmQ21x1, 1*RmQ22x2  +DEBUG: Parameters set to identical values: 1*RmU21x1, 1*RmU22x2  +DEBUG: Parameters set to identical values: 1*Msn1, 1*Msn2  +DEBUG: Parameters set to identical values: 1*Msl1, 1*Msl2  +DEBUG: Parameters set to identical values: 1*Msl4, 1*Msl5  +DEBUG: Parameters set to identical values: 1*Msu1, 1*Msu2  +DEBUG: Parameters set to identical values: 1*Msu4, 1*Msu5  +DEBUG: Parameters set to identical values: 1*Msd1, 1*Msd2  +DEBUG: Parameters set to identical values: 1*Msd4, 1*Msd5  +INFO: Change particles name to pass to MG5 convention +Defined multiparticle p = g u c d s u~ c~ d~ s~ +Defined multiparticle j = g u c d s u~ c~ d~ s~ +Defined multiparticle l+ = e+ mu+ +Defined multiparticle l- = e- mu- +Defined multiparticle vl = ve vm vt +Defined multiparticle vl~ = ve~ vm~ vt~ +Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ e+ mu+ go ul cl t1 ur cr t2 dl sl b1 dr sr b2 ul~ cl~ t1~ ur~ cr~ t2~ dl~ sl~ b1~ dr~ sr~ b2~ t b t~ b~ z w+ h01 h2 h3 h+ sve svm svt el- mul- ta1- er- mur- ta2- w- h- sve~ svm~ svt~ el+ mul+ ta1+ er+ mur+ ta2+ n1 n2 n3 n4 x1+ x2+ ta- x1- x2- ta+ +INFO: Checking for minimal orders which gives processes. +INFO: Please specify coupling orders to bypass this step. 
+INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED +INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 +INFO: Process has 3 diagrams +1 processes with 3 diagrams generated in 0.112 s +Total: 1 processes with 3 diagrams +output standalone_cudacpp CODEGEN_cudacpp_susy_gg_tt +Load PLUGIN.CUDACPP_SA_OUTPUT +Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +DEBUG: cformat =  plugin [export_cpp.py at line 3071]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 143]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 148]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt +INFO: Organizing processes into subprocess groups +INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 +INFO: Processing color information for process: g g > t t~ @1 +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 173]  +DEBUG: type(subproc_group)= [output.py at line 174]  +DEBUG: type(fortran_model)= [output.py at line 175]  +DEBUG: type(me)= me=0 [output.py at line 176]  +INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1246]  +DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1250]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1389]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +DEBUG: Entering 
PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1411]  +DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1107]  +DEBUG: self.include_multi_channel =  False [model_handling.py at line 1108]  +DEBUG: self.support_multichannel =  True [model_handling.py at line 1109]  +DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1203]  +DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1204]  +DEBUG: multi_channel_map =  None [model_handling.py at line 1590]  +DEBUG: diag_to_config =  {} [model_handling.py at line 1645]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1757]  +DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1758]  +DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1757]  +DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1758]  +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. 
+DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1279]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1288]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1305]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1325]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1355]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1366]  +DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1377]  +Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  +ALOHA: aloha starts to compute helicity amplitudes +ALOHA: aloha creates VVV1 set of routines with options: P0 +ALOHA: aloha creates FFV1 routines +ALOHA: aloha creates 2 routines in 0.126 s + VVV1 + FFV1 + FFV1 + FFV1 +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. 
+super_write_set_parameters_onlyfixMajorana (hardcoded=False) +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 617 , keys size = 617 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 617 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 617 , keys size = 617 [model_handling.py at line 733]  +super_write_set_parameters_onlyfixMajorana (hardcoded=True) +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 733]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 733]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , 
keys size = 2 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  +DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 716]  +DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  +DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory +INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. +DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  +quit + +real 0m1.264s +user 0m1.200s +sys 0m0.057s diff --git a/epochX/cudacpp/susy_gg_tt.sa/Cards/param_card.dat b/epochX/cudacpp/susy_gg_tt.sa/Cards/param_card.dat new file mode 100644 index 0000000000..16c221de5e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/Cards/param_card.dat @@ -0,0 +1,492 @@ +###################################################################### +## PARAM_CARD AUTOMATICALY GENERATED BY MG5 FOLLOWING UFO MODEL #### +###################################################################### +## ## +## Width set on Auto will be computed following the information ## +## present in the decay.py files of the model. 
## +## See arXiv:1402.1178 for more details. ## +## ## +###################################################################### + +################################### +## INFORMATION FOR DSQMIX +################################### +Block dsqmix + 1 1 1.000000e+00 # RRd1x1 + 2 2 1.000000e+00 # RRd2x2 + 3 3 9.387379e-01 # RRd3x3 + 3 6 3.446319e-01 # RRd3x6 + 4 4 1.000000e+00 # RRd4x4 + 5 5 1.000000e+00 # RRd5x5 + 6 3 -3.446319e-01 # RRd6x3 + 6 6 9.387379e-01 # RRd6x6 + +################################### +## INFORMATION FOR FRALPHA +################################### +Block fralpha + 1 -1.138252e-01 # alp + +################################### +## INFORMATION FOR HMIX +################################### +Block hmix + 1 3.576810e+02 # RMUH + 2 9.748624e+00 # tb + 4 1.664391e+05 # MA2 + +################################### +## INFORMATION FOR MASS +################################### +Block mass + 5 4.889917e+00 # MB + 6 1.750000e+02 # MT + 15 1.777000e+00 # Mta + 23 9.118760e+01 # MZ + 24 7.982901e+01 # MW + 25 1.108991e+02 # MH01 + 35 3.999601e+02 # MH02 + 36 3.995839e+02 # MA0 + 37 4.078790e+02 # MH + 1000001 5.684411e+02 # set of param :1*Msd1, 1*Msd2 + 1000002 5.611190e+02 # set of param :1*Msu1, 1*Msu2 + 1000005 5.130652e+02 # Msd3 + 1000006 3.996685e+02 # Msu3 + 1000011 2.029157e+02 # set of param :1*Msl1, 1*Msl2 + 1000012 1.852583e+02 # set of param :1*Msn1, 1*Msn2 + 1000015 1.344909e+02 # Msl3 + 1000016 1.847085e+02 # Msn3 + 1000021 6.077137e+02 # Mgo + 1000022 9.668807e+01 # Mneu1 + 1000023 1.810882e+02 # Mneu2 + 1000024 1.816965e+02 # Mch1 + 1000025 -3.637560e+02 # Mneu3 + 1000035 3.817294e+02 # Mneu4 + 1000037 3.799393e+02 # Mch2 + 2000001 5.452285e+02 # set of param :1*Msd4, 1*Msd5 + 2000002 5.492593e+02 # set of param :1*Msu4, 1*Msu5 + 2000005 5.437267e+02 # Msd6 + 2000006 5.857858e+02 # Msu6 + 2000011 1.441028e+02 # set of param :1*Msl4, 1*Msl5 + 2000015 2.068678e+02 # Msl6 +## Dependent parameters, given by model restrictions. 
+## Those values should be edited following the +## analytical expression. MG5 ignores those values +## but they are important for interfacing the output of MG5 +## to external program such as Pythia. + 1 0.000000e+00 # d : 0.0 + 2 0.000000e+00 # u : 0.0 + 3 0.000000e+00 # s : 0.0 + 4 0.000000e+00 # c : 0.0 + 11 0.000000e+00 # e- : 0.0 + 12 0.000000e+00 # ve : 0.0 + 13 0.000000e+00 # mu- : 0.0 + 14 0.000000e+00 # vm : 0.0 + 16 0.000000e+00 # vt : 0.0 + 21 0.000000e+00 # g : 0.0 + 22 0.000000e+00 # a : 0.0 + 1000014 1.852583e+02 # svm : Msn1 + 1000013 2.029157e+02 # mul- : Msl1 + 2000013 1.441028e+02 # mur- : Msl4 + 1000004 5.611190e+02 # cl : Msu1 + 2000004 5.492593e+02 # cr : Msu4 + 1000003 5.684411e+02 # sl : Msd1 + 2000003 5.452285e+02 # sr : Msd4 + +################################### +## INFORMATION FOR MSD2 +################################### +Block msd2 + 1 1 2.736847e+05 # set of param :1*RmD21x1, 1*RmD22x2 + 2 2 2.736847e+05 # MG5 will not use this value use instead 1*mdl_RmD21x1 + 3 3 2.702620e+05 # RmD23x3 + +################################### +## INFORMATION FOR MSE2 +################################### +Block mse2 + 1 1 1.863063e+04 # set of param :1*RmE21x1, 1*RmE22x2 + 2 2 1.863063e+04 # MG5 will not use this value use instead 1*mdl_RmE21x1 + 3 3 1.796764e+04 # RmE23x3 + +################################### +## INFORMATION FOR MSL2 +################################### +Block msl2 + 1 1 3.815567e+04 # set of param :1*RmL21x1, 1*RmL22x2 + 2 2 3.815567e+04 # MG5 will not use this value use instead 1*mdl_RmL21x1 + 3 3 3.782868e+04 # RmL23x3 + +################################### +## INFORMATION FOR MSOFT +################################### +Block msoft + 1 1.013965e+02 # RMx1 + 2 1.915042e+02 # RMx2 + 3 5.882630e+02 # RMx3 + 21 3.233749e+04 # mHd2 + 22 -1.288001e+05 # mHu2 + +################################### +## INFORMATION FOR MSQ2 +################################### +Block msq2 + 1 1 2.998367e+05 # set of param :1*RmQ21x1, 1*RmQ22x2 + 2 2 
2.998367e+05 # MG5 will not use this value use instead 1*mdl_RmQ21x1 + 3 3 2.487654e+05 # RmQ23x3 + +################################### +## INFORMATION FOR MSU2 +################################### +Block msu2 + 1 1 2.803821e+05 # set of param :1*RmU21x1, 1*RmU22x2 + 2 2 2.803821e+05 # MG5 will not use this value use instead 1*mdl_RmU21x1 + 3 3 1.791371e+05 # RmU23x3 + +################################### +## INFORMATION FOR NMIX +################################### +Block nmix + 1 1 9.863644e-01 # RNN1x1 + 1 2 -5.311036e-02 # RNN1x2 + 1 3 1.464340e-01 # RNN1x3 + 1 4 -5.311861e-02 # RNN1x4 + 2 1 9.935054e-02 # RNN2x1 + 2 2 9.449493e-01 # RNN2x2 + 2 3 -2.698467e-01 # RNN2x3 + 2 4 1.561507e-01 # RNN2x4 + 3 1 -6.033880e-02 # RNN3x1 + 3 2 8.770049e-02 # RNN3x2 + 3 3 6.958775e-01 # RNN3x3 + 3 4 7.102270e-01 # RNN3x4 + 4 1 -1.165071e-01 # RNN4x1 + 4 2 3.107390e-01 # RNN4x2 + 4 3 6.492260e-01 # RNN4x3 + 4 4 -6.843778e-01 # RNN4x4 + +################################### +## INFORMATION FOR SELMIX +################################### +Block selmix + 1 1 1.000000e+00 # RRl1x1 + 2 2 1.000000e+00 # RRl2x2 + 3 3 2.824872e-01 # RRl3x3 + 3 6 9.592711e-01 # RRl3x6 + 4 4 1.000000e+00 # RRl4x4 + 5 5 1.000000e+00 # RRl5x5 + 6 3 9.592711e-01 # RRl6x3 + 6 6 -2.824872e-01 # RRl6x6 + +################################### +## INFORMATION FOR SMINPUTS +################################### +Block sminputs + 1 1.279340e+02 # aEWM1 + 3 1.180000e-01 # aS (Note that Parameter not used if you use a PDF set) + +################################### +## INFORMATION FOR SNUMIX +################################### +Block snumix + 1 1 1.000000e+00 # RRn1x1 + 2 2 1.000000e+00 # RRn2x2 + 3 3 1.000000e+00 # RRn3x3 + +################################### +## INFORMATION FOR TD +################################### +Block td + 3 3 -1.106937e+02 # Rtd3x3 + +################################### +## INFORMATION FOR TE +################################### +Block te + 3 3 -2.540197e+01 # Rte3x3 + 
+################################### +## INFORMATION FOR TU +################################### +Block tu + 3 3 -4.447525e+02 # Rtu3x3 + +################################### +## INFORMATION FOR UMIX +################################### +Block umix + 1 1 9.168349e-01 # RUU1x1 + 1 2 -3.992666e-01 # RUU1x2 + 2 1 3.992666e-01 # RUU2x1 + 2 2 9.168349e-01 # RUU2x2 + +################################### +## INFORMATION FOR UPMNS +################################### +Block upmns + 1 1 1.000000e+00 # RMNS1x1 + 2 2 1.000000e+00 # RMNS2x2 + 3 3 1.000000e+00 # RMNS3x3 + +################################### +## INFORMATION FOR USQMIX +################################### +Block usqmix + 1 1 1.000000e+00 # RRu1x1 + 2 2 1.000000e+00 # RRu2x2 + 3 3 5.536450e-01 # RRu3x3 + 3 6 8.327528e-01 # RRu3x6 + 4 4 1.000000e+00 # RRu4x4 + 5 5 1.000000e+00 # RRu5x5 + 6 3 8.327528e-01 # RRu6x3 + 6 6 -5.536450e-01 # RRu6x6 + +################################### +## INFORMATION FOR VCKM +################################### +Block vckm + 1 1 1.000000e+00 # RCKM1x1 + 2 2 1.000000e+00 # RCKM2x2 + 3 3 1.000000e+00 # RCKM3x3 + +################################### +## INFORMATION FOR VMIX +################################### +Block vmix + 1 1 9.725578e-01 # RVV1x1 + 1 2 -2.326612e-01 # RVV1x2 + 2 1 2.326612e-01 # RVV2x1 + 2 2 9.725578e-01 # RVV2x2 + +################################### +## INFORMATION FOR YD +################################### +Block yd + 3 3 1.388402e-01 # Ryd3x3 + +################################### +## INFORMATION FOR YE +################################### +Block ye + 3 3 1.008908e-01 # Rye3x3 + +################################### +## INFORMATION FOR YU +################################### +Block yu + 3 3 8.928445e-01 # Ryu3x3 + +################################### +## INFORMATION FOR DECAY +################################### +DECAY 6 1.561950e+00 # WT +DECAY 23 2.411433e+00 # WZ +DECAY 24 2.002822e+00 # WW +DECAY 25 1.986108e-03 # WH01 +DECAY 35 5.748014e-01 # WH02 +DECAY 36 
6.321785e-01 # WA0 +DECAY 37 5.469628e-01 # WH +DECAY 1000001 5.312788e+00 # Wsd1 +DECAY 1000002 5.477195e+00 # Wsu1 +DECAY 1000003 5.312788e+00 # Wsd2 +DECAY 1000004 5.477195e+00 # Wsu2 +DECAY 1000005 3.736276e+00 # Wsd3 +DECAY 1000006 2.021596e+00 # Wsu3 +DECAY 1000011 2.136822e-01 # Wsl1 +DECAY 1000012 1.498816e-01 # Wsn1 +DECAY 1000013 2.136822e-01 # Wsl2 +DECAY 1000014 1.498816e-01 # Wsn2 +DECAY 1000015 1.483273e-01 # Wsl3 +DECAY 1000016 1.475190e-01 # Wsn3 +DECAY 1000021 5.506754e+00 # Wgo +DECAY 1000023 2.077700e-02 # Wneu2 +DECAY 1000024 1.704145e-02 # Wch1 +DECAY 1000025 1.915985e+00 # Wneu3 +DECAY 1000035 2.585851e+00 # Wneu4 +DECAY 1000037 2.486895e+00 # Wch2 +DECAY 2000001 2.858123e-01 # Wsd4 +DECAY 2000002 1.152973e+00 # Wsu4 +DECAY 2000003 2.858123e-01 # Wsd5 +DECAY 2000004 1.152973e+00 # Wsu5 +DECAY 2000005 8.015663e-01 # Wsd6 +DECAY 2000006 7.373133e+00 # Wsu6 +DECAY 2000011 2.161216e-01 # Wsl4 +DECAY 2000013 2.161216e-01 # Wsl5 +DECAY 2000015 2.699061e-01 # Wsl6 +## Dependent parameters, given by model restrictions. +## Those values should be edited following the +## analytical expression. MG5 ignores those values +## but they are important for interfacing the output of MG5 +## to external program such as Pythia. 
+DECAY 1 0.000000e+00 # d : 0.0 +DECAY 2 0.000000e+00 # u : 0.0 +DECAY 3 0.000000e+00 # s : 0.0 +DECAY 4 0.000000e+00 # c : 0.0 +DECAY 5 0.000000e+00 # b : 0.0 +DECAY 11 0.000000e+00 # e- : 0.0 +DECAY 12 0.000000e+00 # ve : 0.0 +DECAY 13 0.000000e+00 # mu- : 0.0 +DECAY 14 0.000000e+00 # vm : 0.0 +DECAY 15 0.000000e+00 # ta- : 0.0 +DECAY 16 0.000000e+00 # vt : 0.0 +DECAY 21 0.000000e+00 # g : 0.0 +DECAY 22 0.000000e+00 # a : 0.0 +DECAY 1000022 0.000000e+00 # n1 : 0.0 +#=========================================================== +# QUANTUM NUMBERS OF NEW STATE(S) (NON SM PDG CODE) +#=========================================================== + +Block QNUMBERS 1000022 # n1 + 1 0 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000023 # n2 + 1 0 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000025 # n3 + 1 0 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000035 # n4 + 1 0 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000024 # x1+ + 1 3 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000037 # x2+ + 1 3 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000021 # go + 1 0 # 3 times electric charge + 2 2 # number of spin states (2S+1) + 3 8 # 
colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 35 # h2 + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 36 # h3 + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 0 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 37 # h+ + 1 3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000012 # sve + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000014 # svm + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000016 # svt + 1 0 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000011 # el- + 1 -3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000013 # mul- + 1 -3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000015 # ta1- + 1 -3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000011 # er- + 1 -3 # 
3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000013 # mur- + 1 -3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000015 # ta2- + 1 -3 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 1 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000002 # ul + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000004 # cl + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000006 # t1 + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000002 # ur + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000004 # cr + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000006 # t2 + 1 2 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000001 # dl + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # 
Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000003 # sl + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 1000005 # b1 + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000001 # dr + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000003 # sr + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) +Block QNUMBERS 2000005 # b2 + 1 -1 # 3 times electric charge + 2 1 # number of spin states (2S+1) + 3 3 # colour rep (1: singlet, 3: triplet, 8: octet) + 4 1 # Particle/Antiparticle distinction (0=own anti) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h new file mode 100644 index 0000000000..faa8f95d1d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -0,0 +1,519 @@ +#ifndef BRIDGE_H +#define BRIDGE_H 1 + +// Includes from Cuda/C++ matrix element calculations +#include "mgOnGpuConfig.h" // for mgOnGpu::npar, mgOnGpu::np4 + +#include "CPPProcess.h" // for CPPProcess +#include "CrossSectionKernels.h" // for flagAbnormalMEs +#include "MatrixElementKernels.h" // for MatrixElementKernelHost, MatrixElementKernelDevice +#include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM +#include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + 
//-------------------------------------------------------------------------- + /** + * A base class for a class whose pointer is passed between Fortran and C++. + * This is not really necessary, but it allows minimal type checks on all such pointers. + */ + struct CppObjectInFortran + { + CppObjectInFortran() {} + virtual ~CppObjectInFortran() {} + }; + + //-------------------------------------------------------------------------- + /** + * A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow. + * The FORTRANFPTYPE template parameter indicates the precision of the Fortran momenta from MadEvent (float or double). + * The precision of the matrix element calculation is hardcoded in the fptype typedef in CUDA/C++. + * + * The Fortran momenta passed in are in the form of + * DOUBLE PRECISION P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) + * where the dimensions are , , . + * In memory, this is stored in a way that C reads as an array P_MULTI[nevtF][nparF][np4F]. + * The CUDA/C++ momenta are stored as an array[npagM][npar][np4][neppM] with nevt=npagM*neppM. + * The Bridge is configured to store nevt==nevtF events in CUDA/C++. + * It also checks that Fortran and C++ parameters match, nparF==npar and np4F==np4. + * + * The cpu/gpu sequences take FORTRANFPTYPE* (not fptype*) momenta/MEs. + * This allows mixing double in MadEvent Fortran with float in CUDA/C++ sigmaKin. + * In the fcheck_sa.f test, Fortran uses double while CUDA/C++ may use double or float. + * In the check_sa "--bridge" test, everything is implemented in fptype (double or float). 
+ */ + template + class Bridge final : public CppObjectInFortran + { + public: + /** + * Constructor + * + * @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran array loops (VECSIZE_USED <= VECSIZE_MEMMAX) + * @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + * @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY) + */ + Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ); + + /** + * Destructor + */ + virtual ~Bridge() {} + + // Delete copy/move constructors and assignment operators + Bridge( const Bridge& ) = delete; + Bridge( Bridge&& ) = delete; + Bridge& operator=( const Bridge& ) = delete; + Bridge& operator=( Bridge&& ) = delete; + +#ifdef __CUDACC__ + /** + * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads + * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) + * + * @param gpublocks number of gpublocks + * @param gputhreads number of gputhreads + */ + void set_gpugrid( const int gpublocks, const int gputhreads ); + + /** + * Sequence to be executed for the Cuda matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) + * @param mes the pointer to the output matrix elements + * @param goodHelOnly quit after computing good helicities? 
+ * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + */ + void gpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly = false ); +#else + /** + * Sequence to be executed for the vectorized CPU matrix element calculation + * + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param channelId the Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + * @param goodHelOnly quit after computing good helicities? 
+ */ + void cpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly = false ); +#endif + + // Return the number of good helicities (-1 initially when they have not yet been calculated) + int nGoodHel() const { return m_nGoodHel; } + + // Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) + constexpr int nTotHel() const { return mgOnGpu::ncomb; } + + private: + unsigned int m_nevt; // number of events + int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) + +#ifdef __CUDACC__ + int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) + int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) + mg5amcGpu::DeviceBuffer m_devMomentaF; + mg5amcGpu::DeviceBufferMomenta m_devMomentaC; + mg5amcGpu::DeviceBufferGs m_devGs; + mg5amcGpu::DeviceBufferRndNumHelicity m_devRndHel; + mg5amcGpu::DeviceBufferRndNumColor m_devRndCol; + mg5amcGpu::DeviceBufferMatrixElements m_devMEs; + mg5amcGpu::DeviceBufferSelectedHelicity m_devSelHel; + mg5amcGpu::DeviceBufferSelectedColor m_devSelCol; + mg5amcGpu::PinnedHostBufferGs m_hstGs; + mg5amcGpu::PinnedHostBufferRndNumHelicity m_hstRndHel; + mg5amcGpu::PinnedHostBufferRndNumColor m_hstRndCol; + mg5amcGpu::PinnedHostBufferMatrixElements m_hstMEs; + mg5amcGpu::PinnedHostBufferSelectedHelicity m_hstSelHel; + mg5amcGpu::PinnedHostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; + //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads (TEST VALUE FOR MADEVENT) + static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) +#else + mg5amcCpu::HostBufferMomenta m_hstMomentaC; + mg5amcCpu::HostBufferGs m_hstGs; + 
mg5amcCpu::HostBufferRndNumHelicity m_hstRndHel; + mg5amcCpu::HostBufferRndNumColor m_hstRndCol; + mg5amcCpu::HostBufferMatrixElements m_hstMEs; + mg5amcCpu::HostBufferSelectedHelicity m_hstSelHel; + mg5amcCpu::HostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; +#endif + }; + + //-------------------------------------------------------------------------- + // + // Forward declare transposition methods + // + +#ifdef __CUDACC__ + + template + __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + +#endif // __CUDACC__ + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ); + + //-------------------------------------------------------------------------- + // + // Implementations of member functions of class Bridge + // + + template + Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) + : m_nevt( nevtF ) + , m_nGoodHel( -1 ) +#ifdef __CUDACC__ + , m_gputhreads( 256 ) // default number of gpu threads + , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads + , m_devMomentaF( m_nevt ) + , m_devMomentaC( m_nevt ) + , m_devGs( m_nevt ) + , m_devRndHel( m_nevt ) + , m_devRndCol( m_nevt ) + , m_devMEs( m_nevt ) + , m_devSelHel( m_nevt ) + , m_devSelCol( m_nevt ) +#else + , m_hstMomentaC( m_nevt ) +#endif + , m_hstGs( m_nevt ) + , m_hstRndHel( m_nevt ) + , m_hstRndCol( m_nevt ) + , m_hstMEs( m_nevt ) + , m_hstSelHel( m_nevt ) + , m_hstSelCol( m_nevt ) + , m_pmek( nullptr ) + { + if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); +#ifdef __CUDACC__ + if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) + throw std::runtime_error( "Bridge constructor: nevt should be a multiple 
of " + std::to_string( s_gputhreadsmin ) ); + while( m_nevt != m_gpublocks * m_gputhreads ) + { + m_gputhreads /= 2; + if( m_gputhreads < s_gputhreadsmin ) + throw std::logic_error( "Bridge constructor: FIXME! cannot choose gputhreads" ); // this should never happen! + m_gpublocks = m_nevt / m_gputhreads; + } + std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; + mg5amcGpu::CPPProcess process( /*verbose=*/false ); + m_pmek.reset( new mg5amcGpu::MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); +#else + std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; + mg5amcCpu::CPPProcess process( /*verbose=*/false ); + m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // __CUDACC__ + process.initProc( "../../Cards/param_card.dat" ); + } + +#ifdef __CUDACC__ + template + void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) + { + if( m_nevt != gpublocks * gputhreads ) + throw std::runtime_error( "Bridge: gpublocks*gputhreads must equal m_nevt in set_gpugrid" ); + m_gpublocks = gpublocks; + m_gputhreads = gputhreads; + std::cout << "WARNING! 
Set grid in Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads + << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; + m_pmek->setGrid( m_gpublocks, m_gputhreads ); + } +#endif + +#ifdef __CUDACC__ + template + void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + constexpr int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + } + else + { + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + const int thrPerEvt = mgOnGpu::npar * mgOnGpu::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) + //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + } + if constexpr( std::is_same_v ) + { + memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + copyDeviceFromHost( m_devGs, m_hstGs ); + copyDeviceFromHost( m_devRndHel, m_hstRndHel ); + copyDeviceFromHost( m_devRndCol, m_hstRndCol ); + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge gpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( channelId ); + copyHostFromDevice( m_hstMEs, m_devMEs ); + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); + copyHostFromDevice( m_hstSelHel, m_devSelHel ); + copyHostFromDevice( m_hstSelCol, m_devSelCol ); + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + +#ifndef __CUDACC__ + template + void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + const unsigned int channelId, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol, + const bool goodHelOnly ) + { + hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt ); + if constexpr( std::is_same_v ) + { + 
memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) ); + memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) ); + } + else + { + std::copy( gs, gs + m_nevt, m_hstGs.data() ); + std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() ); + std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() ); + } + if( m_nGoodHel < 0 ) + { + m_nGoodHel = m_pmek->computeGoodHelicities(); + if( m_nGoodHel < 0 ) throw std::runtime_error( "Bridge cpu_sequence: computeGoodHelicities returned nGoodHel<0" ); + } + if( goodHelOnly ) return; + m_pmek->computeMatrixElements( channelId ); + flagAbnormalMEs( m_hstMEs.data(), m_nevt ); + if constexpr( std::is_same_v ) + { + memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() ); + memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() ); + memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() ); + } + else + { + std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes ); + std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel ); + std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol ); + } + } +#endif + + //-------------------------------------------------------------------------- + // + // Implementations of transposition methods + // - FORTRAN arrays: P_MULTI(0:3, NEXTERNAL, VECSIZE_USED) ==> p_multi[nevtF][nparF][np4F] in C++ (AOS) + // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) + // + +#ifdef __CUDACC__ + template + __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = true; // default: use old implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr int part = mgOnGpu::npar; + constexpr int mome = mgOnGpu::np4; + constexpr int strd = MemoryAccessMomenta::neppM; + int pos = blockDim.x * blockIdx.x + threadIdx.x; + int arrlen = nevt * part * mome; + if( pos < arrlen 
) + { + int page_i = pos / ( strd * mome * part ); + int rest_1 = pos % ( strd * mome * part ); + int part_i = rest_1 / ( strd * mome ); + int rest_2 = rest_1 % ( strd * mome ); + int mome_i = rest_2 / strd; + int strd_i = rest_2 % strd; + int inpos = + ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + out[pos] = in[inpos]; // F2C (Fortran to C) + } + } + else + { + // AV attempt another implementation with 1 event per thread: this seems slower... + // F-style: AOS[nevtF][nparF][np4F] + // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + constexpr int npar = mgOnGpu::npar; + constexpr int np4 = mgOnGpu::np4; + constexpr int neppM = MemoryAccessMomenta::neppM; + assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? + int ievt = blockDim.x * blockIdx.x + threadIdx.x; + int ipagM = ievt / neppM; + int ieppM = ievt % neppM; + for( int ip4 = 0; ip4 < np4; ip4++ ) + for( int ipar = 0; ipar < npar; ipar++ ) + { + int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + int fpos = ievt * npar * np4 + ipar * np4 + ip4; + out[cpos] = in[fpos]; // F2C (Fortran to C) + } + } + } +#endif + + template + void hst_transposeMomenta( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool oldImplementation = false; // default: use new implementation + if constexpr( oldImplementation ) + { + // SR initial implementation + constexpr unsigned int part = mgOnGpu::npar; + constexpr unsigned int mome = mgOnGpu::np4; + constexpr unsigned int strd = MemoryAccessMomenta::neppM; + unsigned int arrlen = nevt * part * mome; + for( unsigned int pos = 0; pos < arrlen; ++pos ) + { + unsigned int page_i = pos / ( strd * mome * part ); + unsigned int rest_1 = pos % ( strd * mome * part ); + unsigned int part_i = rest_1 / ( strd * mome ); + unsigned int rest_2 = rest_1 % ( strd * mome ); + 
unsigned int mome_i = rest_2 / strd; + unsigned int strd_i = rest_2 % strd; + unsigned int inpos = + ( page_i * strd + strd_i ) // event number + * ( part * mome ) // event size (pos of event) + + part_i * mome // particle inside event + + mome_i; // momentum inside particle + if constexpr( F2C ) // needs c++17 and cuda >=11.2 (#333) + out[pos] = in[inpos]; // F2C (Fortran to C) + else + out[inpos] = in[pos]; // C2F (C to Fortran) + } + } + else + { + // AV attempt another implementation: this is slightly faster (better c++ pipelining?) + // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] + // F-style: AOS[nevtF][nparF][np4F] + // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM + constexpr unsigned int npar = mgOnGpu::npar; + constexpr unsigned int np4 = mgOnGpu::np4; + constexpr unsigned int neppM = MemoryAccessMomenta::neppM; + if constexpr( neppM == 1 && std::is_same_v ) + { + memcpy( out, in, nevt * npar * np4 * sizeof( Tin ) ); + } + else + { + const unsigned int npagM = nevt / neppM; + assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? 
+ for( unsigned int ipagM = 0; ipagM < npagM; ipagM++ ) + for( unsigned int ip4 = 0; ip4 < np4; ip4++ ) + for( unsigned int ipar = 0; ipar < npar; ipar++ ) + for( unsigned int ieppM = 0; ieppM < neppM; ieppM++ ) + { + unsigned int ievt = ipagM * neppM + ieppM; + unsigned int cpos = ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM; + unsigned int fpos = ievt * npar * np4 + ipar * np4 + ip4; + if constexpr( F2C ) + out[cpos] = in[fpos]; // F2C (Fortran to C) + else + out[fpos] = in[cpos]; // C2F (C to Fortran) + } + } + } + } + + template + void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = true; + hst_transposeMomenta( in, out, nevt ); + } + + template + void hst_transposeMomentaC2F( const Tin* in, Tout* out, const unsigned int nevt ) + { + constexpr bool F2C = false; + hst_transposeMomenta( in, out, nevt ); + } + + //-------------------------------------------------------------------------- +} +#endif // BRIDGE_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc new file mode 100644 index 0000000000..c2c16ff038 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -0,0 +1,149 @@ +#include "BridgeKernels.h" + +#include "MemoryAccessMomenta.h" + +#include + +using mgOnGpu::npar; // the number of particles (external = initial + final) +using mgOnGpu::np4; // the number of dimensions of 4-momenta (E,px,py,pz) + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + BridgeKernelBase::BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const 
BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_bridge( nevt, npar, np4 ) + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "BridgeKernelBase: matrixElements must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "BridgeKernelBase: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "BridgeKernelBase: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifndef __CUDACC__ +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelHost::BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : BridgeKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol, nevt ) + , m_fortranMomenta( nevt ) + { + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::transposeInputMomentaC2F() + { + 
hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelHost::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int channelId = 0; // disable multi-channel for helicity filtering + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelHost::computeMatrixElements( const unsigned int channelId ) + { + constexpr bool goodHelOnly = false; + m_bridge.cpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + BridgeKernelDevice::BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: Gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ) + : BridgeKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol, gpublocks * gputhreads ) + , m_fortranMomenta( nevt() ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { 
+ if( m_gpublocks == 0 ) throw std::runtime_error( "BridgeKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "BridgeKernelDevice: gputhreads must be > 0" ); + m_bridge.set_gpugrid( gpublocks, gputhreads ); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::transposeInputMomentaC2F() + { + hst_transposeMomentaC2F( m_momenta.data(), m_fortranMomenta.data(), nevt() ); + } + + //-------------------------------------------------------------------------- + + int BridgeKernelDevice::computeGoodHelicities() + { + constexpr bool goodHelOnly = true; + constexpr unsigned int channelId = 0; // disable multi-channel for helicity filtering + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + return m_bridge.nGoodHel(); + } + + //-------------------------------------------------------------------------- + + void BridgeKernelDevice::computeMatrixElements( const unsigned int channelId ) + { + constexpr bool goodHelOnly = false; + m_bridge.gpu_sequence( m_fortranMomenta.data(), m_gs.data(), m_rndhel.data(), m_rndcol.data(), channelId, m_matrixElements.data(), m_selhel.data(), m_selcol.data(), goodHelOnly ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h new file mode 100644 index 0000000000..10e664a4c4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h @@ -0,0 +1,134 @@ +#ifndef BRIDGEKERNELS_H +#define BRIDGEKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "Bridge.h" +#include "MatrixElementKernels.h" +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace 
mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A Bridge wrapper base class encapsulating matrix element calculations on a CPU host + class BridgeKernelBase : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + BridgeKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~BridgeKernelBase() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + virtual void transposeInputMomentaC2F() = 0; + + protected: + + // The wrapped bridge + Bridge m_bridge; + }; + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A Bridge wrapper class encapsulating matrix element calculations on a CPU host + class BridgeKernelHost final : public BridgeKernelBase + { + public: + + // Constructor from existing input and output buffers + BridgeKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~BridgeKernelHost() {} + + // 
Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + HostBufferMomenta m_fortranMomenta; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A Bridge wrapper class encapsulating matrix element calculations on a GPU device + class BridgeKernelDevice : public BridgeKernelBase + { + public: + + // Constructor from existing input and output buffers + BridgeKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~BridgeKernelDevice() {} + + // Transpose input momenta from C to Fortran before the matrix element calculation in the Bridge + void transposeInputMomentaC2F() override final; + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the input momenta, transposed to Fortran array indexing + PinnedHostBufferMomenta m_fortranMomenta; + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // BRIDGEKERNELS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt new file mode 100644 index 0000000000..1e15f3e9ed --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt @@ -0,0 +1,4 @@ +SUBDIRLIST(SUBDIRS) +FOREACH(subdir ${SUBDIRS}) + ADD_SUBDIRECTORY(${subdir}) +ENDFOREACH() diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc new file mode 100644 index 0000000000..398f8a87bd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -0,0 +1,231 @@ +#include "CrossSectionKernels.h" + +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" + +#include + +// ****************************************************************************************** +// *** NB: Disabling fast math is essential here, otherwise results are undefined *** +// *** NB: This file CrossSectionKernels.cc IS BUILT WITH -fno-fast-math in the Makefile! 
*** +// *** NB: Attempts with __attribute__((optimize("-fno-fast-math"))) were unsatisfactory *** +// ****************************************************************************************** + +inline bool +fp_is_nan( const fptype& fp ) +{ + //#pragma clang diagnostic push + //#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) + return std::isnan( fp ); // always false for clang in fast math mode (tautological compare)? + //#pragma clang diagnostic pop +} + +inline bool +fp_is_abnormal( const fptype& fp ) +{ + if( fp_is_nan( fp ) ) return true; + if( fp != fp ) return true; + return false; +} + +inline bool +fp_is_zero( const fptype& fp ) +{ + if( fp == 0 ) return true; + return false; +} + +// See https://en.cppreference.com/w/cpp/numeric/math/FP_categories +inline const char* +fp_show_class( const fptype& fp ) +{ + switch( std::fpclassify( fp ) ) + { + case FP_INFINITE: return "Inf"; + case FP_NAN: return "NaN"; + case FP_NORMAL: return "normal"; + case FP_SUBNORMAL: return "subnormal"; + case FP_ZERO: return "zero"; + default: return "unknown"; + } +} + +inline void +debug_me_is_abnormal( const fptype& me, size_t ievtALL ) +{ + std::cout << "DEBUG[" << ievtALL << "]" + << " ME=" << me + << " fpisabnormal=" << fp_is_abnormal( me ) + << " fpclass=" << fp_show_class( me ) + << " (me==me)=" << ( me == me ) + << " (me==me+1)=" << ( me == me + 1 ) + << " isnan=" << fp_is_nan( me ) + << " isfinite=" << std::isfinite( me ) + << " isnormal=" << std::isnormal( me ) + << " is0=" << ( me == 0 ) + << " is1=" << ( me == 1 ) + << " abs(ME)=" << std::abs( me ) + << " isnan=" << fp_is_nan( std::abs( me ) ) + << std::endl; +} + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + void flagAbnormalMEs( fptype* hstMEs, 
unsigned int nevt ) + { + for( unsigned int ievt = 0; ievt < nevt; ievt++ ) + { + if( fp_is_abnormal( hstMEs[ievt] ) ) + { + std::cout << "WARNING! flagging abnormal ME for ievt=" << ievt << std::endl; + hstMEs[ievt] = std::sqrt( -1. ); + } + } + } + + //-------------------------------------------------------------------------- + + CrossSectionKernelHost::CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( nevt ) + { + if( m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: samplingWeights must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelHost: matrixElements must be a host array" ); + if( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with samplingWeights" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelHost: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelHost::updateEventStatistics( const bool debug ) + { + EventStatistics stats; // new statistics for the new nevt events + // FIRST PASS: COUNT ALL/ABN/ZERO EVENTS, COMPUTE MIN/MAX, COMPUTE REFS AS MEANS OF SIMPLE SUMS + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + const size_t ievtALL = m_iter * nevt() + ievt; + // The following events are abnormal in a run with "-p 2048 256 12 -d" + // - 
check.exe/commonrand: ME[310744,451171,3007871,3163868,4471038,5473927] with fast math + // - check.exe/curand: ME[578162,1725762,2163579,5407629,5435532,6014690] with fast math + // - gcheck.exe/curand: ME[596016,1446938] with fast math + // Debug NaN/abnormal issues + //if ( ievtALL == 310744 ) // this ME is abnormal both with and without fast math + // debug_me_is_abnormal( me, ievtALL ); + //if ( ievtALL == 5473927 ) // this ME is abnormal only with fast math + // debug_me_is_abnormal( me, ievtALL ); + stats.nevtALL++; + if( fp_is_abnormal( me ) ) + { + if( debug ) // only printed out with "-p -d" (matrixelementALL is not filled without -p) + std::cout << "WARNING! ME[" << ievtALL << "] is NaN/abnormal" << std::endl; + stats.nevtABN++; + continue; + } + if( fp_is_zero( me ) ) stats.nevtZERO++; + stats.minME = std::min( stats.minME, (double)me ); + stats.maxME = std::max( stats.maxME, (double)me ); + stats.minWG = std::min( stats.minWG, (double)wg ); + stats.maxWG = std::max( stats.maxWG, (double)wg ); + stats.sumMEdiff += me; // NB stats.refME is 0 here + stats.sumWGdiff += wg; // NB stats.refWG is 0 here + } + stats.refME = stats.meanME(); // draft ref + stats.refWG = stats.meanWG(); // draft ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // SECOND PASS: IMPROVE MEANS FROM SUMS OF DIFFS TO PREVIOUS REF, UPDATE REF + for( size_t ievt = 0; ievt < nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sumMEdiff += ( me - stats.refME ); + stats.sumWGdiff += ( wg - stats.refWG ); + } + stats.refME = stats.meanME(); // final ref + stats.refWG = stats.meanWG(); // final ref + stats.sumMEdiff = 0; + stats.sumWGdiff = 0; + // THIRD PASS: COMPUTE STDDEV FROM SQUARED SUMS OF DIFFS TO REF + for( size_t ievt = 0; ievt < 
nevt(); ++ievt ) // Loop over all events in this iteration + { + const fptype& me = MemoryAccessMatrixElements::ieventAccessConst( m_matrixElements.data(), ievt ); + const fptype& wg = MemoryAccessWeights::ieventAccessConst( m_samplingWeights.data(), ievt ); + if( fp_is_abnormal( me ) ) continue; + stats.sqsMEdiff += std::pow( me - stats.refME, 2 ); + stats.sqsWGdiff += std::pow( wg - stats.refWG, 2 ); + } + // FOURTH PASS: UPDATE THE OVERALL STATS BY ADDING THE NEW STATS + m_stats += stats; + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- +} + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + /* + //-------------------------------------------------------------------------- + + CrossSectionKernelDevice::CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ) + : CrossSectionKernelBase( samplingWeights, matrixElements, stats ) + , NumberOfEvents( gpublocks*gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if ( ! m_samplingWeights.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: samplingWeights must be a device array" ); + if ( ! 
m_matrixElements.isOnDevice() ) throw std::runtime_error( "CrossSectionKernelDevice: matrixElements must be a device array" ); + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0" ); + if ( this->nevt() != m_samplingWeights.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with samplingWeights" ); + if ( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch with matrixElements" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::setGrid( const size_t gpublocks, const size_t gputhreads ) + { + if ( m_gpublocks == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gpublocks must be > 0 in setGrid" ); + if ( m_gputhreads == 0 ) throw std::runtime_error( "CrossSectionKernelDevice: gputhreads must be > 0 in setGrid" ); + if ( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "CrossSectionKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + void CrossSectionKernelDevice::updateEventStatistics( const bool debug ) + { + // Increment the iterations counter + m_iter++; + } + + //-------------------------------------------------------------------------- + */ + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h new file mode 100644 index 0000000000..6098157b4e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -0,0 +1,133 @@ +#ifndef CROSSSECTIONKERNELS_H +#define CROSSSECTIONKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "EventStatistics.h" 
+#include "MemoryBuffers.h" + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // Helper function for Bridge.h: must be compiled without fast math + // Iterate through all output MEs and replace any NaN/abnormal ones by sqrt(-1) + void flagAbnormalMEs( fptype* hstMEs, unsigned int nevt ); + + //-------------------------------------------------------------------------- + + // A base class encapsulating the calculation of event statistics on a CPU host or on a GPU device + class CrossSectionKernelBase //: virtual public ICrossSectionKernel + { + protected: + + // Constructor from existing input and output buffers + CrossSectionKernelBase( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats ) // output: event statistics + : m_samplingWeights( samplingWeights ) + , m_matrixElements( matrixElements ) + , m_stats( stats ) + , m_iter( 0 ) + { + // NB: do not initialise EventStatistics (you may be asked to update an existing result) + } + + public: + + // Destructor + virtual ~CrossSectionKernelBase() {} + + // Update event statistics + virtual void updateEventStatistics( const bool debug = false ) = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the sampling weights + const BufferWeights& m_samplingWeights; + + // The buffer for the output matrix elements + const BufferMatrixElements& m_matrixElements; + + // The event statistics + EventStatistics& m_stats; + + // The number of iterations processed so far + size_t m_iter; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating the calculation of event statistics on a CPU host + class CrossSectionKernelHost final : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelHost( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t nevt ); + + // Destructor + virtual ~CrossSectionKernelHost() {} + + // Update event statistics + void updateEventStatistics( const bool debug = false ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + + /* +#ifdef __CUDACC__ + // A class encapsulating the calculation of event statistics on a GPU device + class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + CrossSectionKernelDevice( const BufferWeights& samplingWeights, // input: sampling weights + const BufferMatrixElements& matrixElements, // input: matrix elements + EventStatistics& stats, // output: event statistics + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~CrossSectionKernelDevice(){} + + // Reset gpublocks and gputhreads + void setGrid( const size_t gpublocks, const size_t gputhreads ); + + // Update event statistics + void updateEventStatistics( const bool debug=false ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + + }; +#endif + */ + + //-------------------------------------------------------------------------- +} +#endif // CROSSSECTIONKERNELS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..e16ed2c703 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h @@ -0,0 +1,80 @@ +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 
+#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] 
+ // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h new file mode 100644 index 0000000000..19c5199bcc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h @@ -0,0 +1,160 @@ +#ifndef EventStatistics_H +#define EventStatistics_H 1 + +#include "mgOnGpuConfig.h" // for npar (meGeVexponent) + +#include +#include +#include +#include +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // The EventStatistics struct is used to accumulate running aggregates of event 
statistics. + // This will eventually include the process cross section and the process maximum weight: + // one important case of EventStatistics will then be the "gridpack" result set, which is + // the output of the "integration" step and the input to "unweighted event generation" step. + // The current implementation only includes statistics for matrix elements (ME) and sampling weights (WG); + // in first approximation, the process cross section and maximum weight are just the mean ME and maximum ME, + // but eventually the sampling weights WG (e.g. from Rambo) must also be taken into account in the calculation. + // The implementation uses differences to reference values to improve numerical precision. + struct EventStatistics + { + public: + size_t nevtALL; // total number of events used + size_t nevtABN; // number of events used, where ME is abnormal (nevtABN <= nevtALL) + size_t nevtZERO; // number of not-abnormal events used, where ME is zero (nevtZERO <= nevtOK) + double minME; // minimum matrix element + double maxME; // maximum matrix element + double minWG; // minimum sampling weight + double maxWG; // maximum sampling weight + double refME; // "reference" matrix element (normally the current mean) + double refWG; // "reference" sampling weight (normally the current mean) + double sumMEdiff; // sum of diff to ref for matrix element + double sumWGdiff; // sum of diff to ref for sampling weight + double sqsMEdiff; // squared sum of diff to ref for matrix element + double sqsWGdiff; // squared sum of diff to ref for sampling weight + std::string tag; // a text tag for printouts + // Number of events used, where ME is not abnormal + size_t nevtOK() const { return nevtALL - nevtABN; } + // Mean matrix element + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanME() const + { + return refME + ( nevtOK() > 0 ? 
sumMEdiff / nevtOK() : 0 ); + } + // Mean sampling weight + // [x = ref+d => mean(x) = sum(x)/n = ref+sum(d)/n] + double meanWG() const + { + return refWG + ( nevtOK() > 0 ? sumWGdiff / nevtOK() : 0 ); + } + // Variance matrix element + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varME() const { return ( sqsMEdiff - std::pow( sumMEdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Variance sampling weight + // [x = ref+d => n*var(x) = sum((x-mean(x))^2) = sum((ref+d-ref-sum(d)/n)^2) = sum((d-sum(d)/n)^2)/n = sum(d^2)-(sum(d))^2/n] + double varWG() const { return ( sqsWGdiff - std::pow( sumWGdiff, 2 ) / nevtOK() ) / nevtOK(); } + // Standard deviation matrix element + double stdME() const { return std::sqrt( varME() ); } + // Standard deviation sampling weight + double stdWG() const { return std::sqrt( varWG() ); } + // Update reference matrix element + void updateRefME( const double newRef ) + { + const double deltaRef = refME - newRef; + sqsMEdiff += deltaRef * ( 2 * sumMEdiff + nevtOK() * deltaRef ); + sumMEdiff += deltaRef * nevtOK(); + refME = newRef; + } + // Update reference sampling weight + void updateRefWG( const double newRef ) + { + const double deltaRef = refWG - newRef; + sqsWGdiff += deltaRef * ( 2 * sumWGdiff + nevtOK() * deltaRef ); + sumWGdiff += deltaRef * nevtOK(); + refWG = newRef; + } + // Constructor + EventStatistics() + : nevtALL( 0 ) + , nevtABN( 0 ) + , nevtZERO( 0 ) + , minME( std::numeric_limits::max() ) + , maxME( std::numeric_limits::lowest() ) + , minWG( std::numeric_limits::max() ) + , maxWG( std::numeric_limits::lowest() ) + , refME( 0 ) + , refWG( 0 ) + , sumMEdiff( 0 ) + , sumWGdiff( 0 ) + , sqsMEdiff( 0 ) + , sqsWGdiff( 0 ) + , tag( "" ) {} + // Combine two EventStatistics + EventStatistics& operator+=( const EventStatistics& stats ) + { + EventStatistics s1 = *this; // temporary copy + EventStatistics s2 = stats; // temporary copy + 
EventStatistics& sum = *this; + sum.nevtALL = s1.nevtALL + s2.nevtALL; + sum.nevtABN = s1.nevtABN + s2.nevtABN; + sum.nevtZERO = s1.nevtZERO + s2.nevtZERO; + sum.minME = std::min( s1.minME, s2.minME ); + sum.maxME = std::max( s1.maxME, s2.maxME ); + sum.minWG = std::min( s1.minWG, s2.minWG ); + sum.maxWG = std::max( s1.maxWG, s2.maxWG ); + sum.refME = ( s1.meanME() * s1.nevtOK() + s2.meanME() * s2.nevtOK() ) / sum.nevtOK(); // new mean ME + s1.updateRefME( sum.refME ); + s2.updateRefME( sum.refME ); + sum.sumMEdiff = s1.sumMEdiff + s2.sumMEdiff; + sum.sqsMEdiff = s1.sqsMEdiff + s2.sqsMEdiff; + sum.refWG = ( s1.meanWG() * s1.nevtOK() + s2.meanWG() * s2.nevtOK() ) / sum.nevtOK(); // new mean WG + s1.updateRefWG( sum.refWG ); + s2.updateRefWG( sum.refWG ); + sum.sumWGdiff = s1.sumWGdiff + s2.sumWGdiff; + sum.sqsWGdiff = s1.sqsWGdiff + s2.sqsWGdiff; + return sum; + } + // Printout + void printout( std::ostream& out ) const + { + const EventStatistics& s = *this; + constexpr int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + out << s.tag << "NumMatrixElems(notAbnormal) = " << s.nevtOK() << std::endl + << std::scientific // fixed format: affects all floats (default precision: 6) + << s.tag << "MeanMatrixElemValue = ( " << s.meanME() + << " +- " << s.stdME() / std::sqrt( s.nevtOK() ) << " ) GeV^" << meGeVexponent << std::endl // standard error + << s.tag << "[Min,Max]MatrixElemValue = [ " << s.minME + << " , " << s.maxME << " ] GeV^" << meGeVexponent << std::endl + << s.tag << "StdDevMatrixElemValue = ( " << s.stdME() + << std::string( 16, ' ' ) << " ) GeV^" << meGeVexponent << std::endl + << s.tag << "MeanWeight = ( " << s.meanWG() + << " +- " << s.stdWG() / std::sqrt( s.nevtOK() ) << std::endl // standard error + << s.tag << "[Min,Max]Weight = [ " << s.minWG + << " , " << s.maxWG << " ]" << std::endl + << s.tag << "StdDevWeight = ( " << s.stdWG() + << std::string( 16, ' ' ) << " )" << std::endl + << std::defaultfloat; // default format: affects all floats + } + }; + + 
//-------------------------------------------------------------------------- + + inline std::ostream& operator<<( std::ostream& out, const EventStatistics& s ) + { + s.printout( out ); + return out; + } + + //-------------------------------------------------------------------------- +} + +#endif // EventStatistics_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h new file mode 100644 index 0000000000..2a0be47978 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h @@ -0,0 +1,300 @@ +// Stephan Hageboeck, CERN, 12/2020 +#ifndef MADGRAPHTEST_H_ +#define MADGRAPHTEST_H_ 1 + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + + struct ReferenceData + { + std::vector>> momenta; + std::vector MEs; + }; + + /// Read batches of reference data from a file and store them in a map. + std::map readReferenceData( const std::string& refFileName ) + { + std::ifstream referenceFile( refFileName.c_str() ); + EXPECT_TRUE( referenceFile.is_open() ) << refFileName; + std::map referenceData; + unsigned int evtNo; + unsigned int batchNo; + for( std::string line; std::getline( referenceFile, line ); ) + { + std::stringstream lineStr( line ); + if( line.empty() || line[0] == '#' ) + { + continue; + } + else if( line.find( "Event" ) != std::string::npos ) + { + std::string dummy; + lineStr >> dummy >> evtNo >> dummy >> batchNo; + } + else if( line.find( "ME" ) != std::string::npos ) + { + if( evtNo <= referenceData[batchNo].MEs.size() ) + referenceData[batchNo].MEs.resize( evtNo + 1 ); + + std::string dummy; + lineStr >> dummy >> referenceData[batchNo].MEs[evtNo]; + } + else + { + unsigned int particleIndex; + lineStr >> particleIndex; + if( evtNo <= referenceData[batchNo].momenta.size() ) + referenceData[batchNo].momenta.resize( evtNo + 1 ); + if( particleIndex <= referenceData[batchNo].momenta[evtNo].size() ) + 
referenceData[batchNo].momenta[evtNo].resize( particleIndex + 1 ); + auto& fourVec = referenceData[batchNo].momenta[evtNo][particleIndex]; + for( unsigned int i = 0; i < fourVec.size(); ++i ) + { + EXPECT_TRUE( lineStr.good() ); + lineStr >> fourVec[i]; + } + EXPECT_TRUE( lineStr.eof() ); + } + } + return referenceData; + } + +} + +/** + * Test driver providing a common interface for testing different implementations. + * Users need to implement: + * - Functions to retrieve matrix element and 4-momenta. These are used in the tests. + * - Driver functions that run the madgraph workflow. + * + * Usage: + * ``` + * class TestImplementation : public TestDriverBase { + * + * } + * + * class TestImplementation2 : public TestDriverBase { + * + * } + * + * INSTANTIATE_TEST_SUITE_P( TestName, + * MadgraphTest, + * testing::Values( new TestImplementation, new TestImplementation2, ... ) ); + *``` + * + * For adapting the test workflow, see the .cc and adapt + * TEST_P(MadgraphTest, CompareMomentaAndME) + * + * To add a test that should be runnable with all test implementations that derive from TestDriverBase, add a new + * TEST_P(MadgraphTest, ) { + * + * } + */ +class TestDriverBase +{ + std::string m_refFileName; +public: + const unsigned int nparticle; + static constexpr unsigned int niter = 2; + static constexpr unsigned int gpublocks = 2; + static constexpr unsigned int gputhreads = 128; + static constexpr unsigned int nevt = gpublocks * gputhreads; + + TestDriverBase( unsigned int npart, const std::string& refFileName ) + : m_refFileName( refFileName ) + , nparticle( npart ) + { + } + TestDriverBase() = delete; + virtual ~TestDriverBase() {} + const std::string& getRefFileName() { return m_refFileName; } + + // ------------------------------------------------ + // Interface for retrieving info from madgraph + // ------------------------------------------------ + virtual fptype getMomentum( std::size_t evtNo, unsigned int particleNo, unsigned int component ) const = 0; + 
virtual fptype getMatrixElement( std::size_t evtNo ) const = 0; + + // ------------------------------------------------ + // Interface for steering madgraph run + // ------------------------------------------------ + virtual void prepareRandomNumbers( unsigned int iiter ) = 0; + virtual void prepareMomenta( fptype energy ) = 0; + virtual void runSigmaKin( std::size_t iiter ) = 0; + + /// Print the requested event into the stream. If the reference data has enough events, it will be printed as well. + void dumpParticles( std::ostream& stream, std::size_t ievt, unsigned int numParticles, unsigned int nDigit, const ReferenceData& referenceData ) const + { + const auto width = nDigit + 8; + for( unsigned int ipar = 0; ipar < numParticles; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + stream << std::scientific // fixed format: affects all floats (default nDigit: 6) + << std::setprecision( nDigit ) + << std::setw( 4 ) << ipar + << std::setw( width ) << getMomentum( ievt, ipar, 0 ) + << std::setw( width ) << getMomentum( ievt, ipar, 1 ) + << std::setw( width ) << getMomentum( ievt, ipar, 2 ) + << std::setw( width ) << getMomentum( ievt, ipar, 3 ) + << "\n"; + if( ievt < referenceData.momenta.size() ) + { + stream << "ref" << ipar; + stream << std::setw( width ) << referenceData.momenta[ievt][ipar][0] + << std::setw( width ) << referenceData.momenta[ievt][ipar][1] + << std::setw( width ) << referenceData.momenta[ievt][ipar][2] + << std::setw( width ) << referenceData.momenta[ievt][ipar][3] + << "\n\n"; + } + stream << std::flush << std::defaultfloat; // default format: affects all floats + } + } +}; + +/** + * Test class that's defining all tests to run with a Madgraph workflow. + * The tests are defined below using TEST_P. + * Instantiate them using: + * ``` + * INSTANTIATE_TEST_SUITE_P( TestName, + * MadgraphTest, + * testing::Values( new TestImplementation, new TestImplementation2, ... 
) ); + * ``` + */ +class MadgraphTest : public testing::TestWithParam +{ +protected: + std::unique_ptr testDriver; + + MadgraphTest() + : TestWithParam(), testDriver( GetParam() ) + { + } +}; + +// Since we link both the CPU-only and GPU tests into the same executable, we prevent +// a multiply defined symbol by only compiling this in the non-CUDA phase: +#ifndef __CUDACC__ + +/// Compare momenta and matrix elements. +/// This uses an implementation of TestDriverBase to run a madgraph workflow, +/// and compares momenta and matrix elements with a reference file. +TEST_P( MadgraphTest, CompareMomentaAndME ) +{ + // Set to true to dump events: + constexpr bool dumpEvents = false; + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; +#ifdef __APPLE__ + const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 +#else + const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 2.E-3; +#endif + std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; + while( dumpFileName.find( '/' ) != std::string::npos ) + { + dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); + } + std::ofstream dumpFile; + if( dumpEvents ) + { + dumpFile.open( dumpFileName, std::ios::trunc ); + } + // Read reference data + const std::string refFileName = testDriver->getRefFileName(); + std::map referenceData; + if( !dumpEvents ) + { + referenceData = readReferenceData( refFileName ); + } + ASSERT_FALSE( HasFailure() ); // It doesn't make any sense to continue if we couldn't read the reference file. 
+ // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + for( unsigned int iiter = 0; iiter < testDriver->niter; ++iiter ) + { + testDriver->prepareRandomNumbers( iiter ); + testDriver->prepareMomenta( energy ); + testDriver->runSigmaKin( iiter ); + // --- Run checks on all events produced in this iteration + for( std::size_t ievt = 0; ievt < testDriver->nevt && !HasFailure(); ++ievt ) + { + if( dumpEvents ) + { + ASSERT_TRUE( dumpFile.is_open() ) << dumpFileName; + dumpFile << "Event " << std::setw( 8 ) << ievt << " " + << "Batch " << std::setw( 4 ) << iiter << "\n"; + testDriver->dumpParticles( dumpFile, ievt, testDriver->nparticle, 15, ReferenceData() ); + // Dump matrix element + dumpFile << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::endl + << std::defaultfloat; + continue; + } + // Check that we have the required reference data + ASSERT_GT( referenceData.size(), iiter ) + << "Don't have enough reference data for iteration " << iiter << ". Ref file:" << refFileName; + ASSERT_GT( referenceData[iiter].MEs.size(), ievt ) + << "Don't have enough reference MEs for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GT( referenceData[iiter].momenta.size(), ievt ) + << "Don't have enough reference momenta for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + ASSERT_GE( referenceData[iiter].momenta[ievt].size(), testDriver->nparticle ) + << "Don't have enough reference particles for iteration " << iiter << " event " << ievt << ".\nRef file: " << refFileName; + // This trace will help to understand the event that is being checked. 
+ // It will only be printed in case of failures: + std::stringstream eventTrace; + eventTrace << "In comparing event " << ievt << " from iteration " << iiter << "\n"; + testDriver->dumpParticles( eventTrace, ievt, testDriver->nparticle, 15, referenceData[iiter] ); + eventTrace << std::setw( 4 ) << "ME" << std::scientific << std::setw( 15 + 8 ) + << testDriver->getMatrixElement( ievt ) << "\n" + << std::setw( 4 ) << "r.ME" << std::scientific << std::setw( 15 + 8 ) + << referenceData[iiter].MEs[ievt] << std::endl + << std::defaultfloat; + SCOPED_TRACE( eventTrace.str() ); + // Compare Momenta + for( unsigned int ipar = 0; ipar < testDriver->nparticle; ++ipar ) + { + std::stringstream momentumErrors; + for( unsigned int icomp = 0; icomp < mgOnGpu::np4; ++icomp ) + { + const fptype pMadg = testDriver->getMomentum( ievt, ipar, icomp ); + const fptype pOrig = referenceData[iiter].momenta[ievt][ipar][icomp]; + const fptype relDelta = fabs( ( pMadg - pOrig ) / pOrig ); + if( relDelta > toleranceMomenta ) + { + momentumErrors << std::setprecision( 15 ) << std::scientific << "\nparticle " << ipar << "\tcomponent " << icomp + << "\n\t madGraph: " << std::setw( 22 ) << pMadg + << "\n\t reference: " << std::setw( 22 ) << pOrig + << "\n\t rel delta: " << std::setw( 22 ) << relDelta << " exceeds tolerance of " << toleranceMomenta; + } + } + ASSERT_TRUE( momentumErrors.str().empty() ) << momentumErrors.str(); + } + // Compare ME: + EXPECT_NEAR( testDriver->getMatrixElement( ievt ), + referenceData[iiter].MEs[ievt], + toleranceMEs * referenceData[iiter].MEs[ievt] ); + } + } + if( dumpEvents ) + { + std::cout << "Event dump written to " << dumpFileName << std::endl; + } +} + +#endif // __CUDACC__ + +#endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc new file mode 100644 index 0000000000..da81c99218 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -0,0 +1,237 @@ +#include "MatrixElementKernels.h" + +#include "CPPProcess.h" +#include "CudaRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" + +#include + +//============================================================================ + +#ifndef __CUDACC__ +namespace mg5amcCpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelHost::MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( nevt ) + , m_couplings( nevt ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( nevt ) + , m_denominators( nevt ) +#endif + { + if( m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: momenta must be a host array" ); + if( m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelHost: matrixElements must be a host array" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelHost: nevt mismatch with matrixElements" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << 
"MatrixElementKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation + // Note: this prevents a crash on pmpe04 but not on some github CI nodes? + // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] + if( !MatrixElementKernelHost::hostSupportsSIMD() ) + throw std::runtime_error( "Host does not support the SIMD implementation of MatrixElementKernelsHost" ); + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelHost::computeGoodHelicities() + { + using mgOnGpu::ncomb; // the number of helicity combinations + HostBufferHelicityMask hstIsGoodHel( ncomb ); + // ... 0d1. Compute good helicity mask on the host + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), hstIsGoodHel.data(), nevt() ); +#else + sigmaKin_getGoodHel( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), hstIsGoodHel.data(), nevt() ); +#endif + // ... 0d2. Copy back good helicity list to static memory on the host + // [FIXME! REMOVE THIS STATIC THAT BREAKS MULTITHREADING?] 
+ return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelHost::computeMatrixElements( const unsigned int channelId ) + { + computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data(), nevt() ); +#else + sigmaKin( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data(), nevt() ); +#endif + } + + //-------------------------------------------------------------------------- + + // Does this host system support the SIMD used in the matrix element calculation? + bool MatrixElementKernelHost::hostSupportsSIMD( const bool verbose ) + { +#if defined __AVX512VL__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx512vl" ); + const std::string tag = "skylake-avx512 (AVX512VL)"; +#elif defined __AVX2__ + bool known = true; + bool ok = __builtin_cpu_supports( "avx2" ); + const std::string tag = "haswell (AVX2)"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + bool known = true; + bool ok = __builtin_cpu_supports( "vsx" ); + const std::string tag = "powerpc vsx (128bit as in SSE4.2)"; +#elif defined __ARM_NEON__ // consider using __BUILTIN_CPU_SUPPORTS__ + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this 
is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; +#else + bool known = true; + bool ok = __builtin_cpu_supports( "sse4.2" ); + const std::string tag = "nehalem (SSE4.2)"; +#endif +#else + bool known = true; + bool ok = true; + const std::string tag = "none"; +#endif + if( verbose ) + { + if( tag == "none" ) + std::cout << "INFO: The application does not require the host to support any AVX feature" << std::endl; + else if( ok && known ) + std::cout << "INFO: The application is built for " << tag << " and the host supports it" << std::endl; + else if( ok ) + std::cout << "WARNING: The application is built for " << tag << " but it is unknown if the host supports it" << std::endl; + else + std::cout << "ERROR! The application is built for " << tag << " but the host does not support it" << std::endl; + } + return ok; + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + + //-------------------------------------------------------------------------- + + MatrixElementKernelDevice::MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ) + : MatrixElementKernelBase( momenta, gs, rndhel, rndcol, matrixElements, selhel, selcol ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_couplings( this->nevt() ) +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + , m_numerators( this->nevt() ) + , m_denominators( this->nevt() ) 
+#endif + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: momenta must be a device array" ); + if( !m_matrixElements.isOnDevice() ) throw std::runtime_error( "MatrixElementKernelDevice: matrixElements must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_matrixElements.nevt() ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch with matrixElements" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "MatrixElementKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelDevice::setGrid( const int gpublocks, const int gputhreads ) + { + if( m_gpublocks == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gpublocks must be > 0 in setGrid" ); + if( m_gputhreads == 0 ) throw std::runtime_error( "MatrixElementKernelDevice: gputhreads must be > 0 in setGrid" ); + if( this->nevt() != m_gpublocks * m_gputhreads ) throw std::runtime_error( "MatrixElementKernelDevice: nevt mismatch in setGrid" ); + } + + //-------------------------------------------------------------------------- + + int MatrixElementKernelDevice::computeGoodHelicities() + { + using mgOnGpu::ncomb; // the number of helicity combinations + PinnedHostBufferHelicityMask 
hstIsGoodHel( ncomb ); + DeviceBufferHelicityMask devIsGoodHel( ncomb ); + // ... 0d1. Compute good helicity mask on the device + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); +#else + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); +#endif + checkCuda( cudaPeekAtLastError() ); + // ... 0d2. Copy back good helicity mask to the host + copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); + // ... 0d3. Copy back good helicity list to constant memory on the device + return sigmaKin_setGoodHel( hstIsGoodHel.data() ); + } + + //-------------------------------------------------------------------------- + + void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) + { + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); +#ifndef MGONGPU_NSIGHT_DEBUG + constexpr unsigned int sharedMemSize = 0; +#else + constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); +#endif +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); +#else + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); +#endif + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); + } + + //-------------------------------------------------------------------------- + +} +#endif + +//============================================================================ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h new file mode 100644 index 0000000000..ec0fc9b18c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -0,0 +1,183 @@ +#ifndef MATRIXELEMENTKERNELS_H +#define MATRIXELEMENTKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating matrix element calculations on a CPU host or on a GPU device + class MatrixElementKernelBase //: virtual public IMatrixElementKernel + { + protected: + + // Constructor from existing input and output buffers + MatrixElementKernelBase( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol ) // output: color selection + : m_momenta( momenta ) + , m_gs( gs ) + , m_rndhel( rndhel ) + , m_rndcol( rndcol ) + , m_matrixElements( matrixElements ) + , m_selhel( selhel ) + , m_selcol( selcol ) + { + } + + public: + + // Destructor + virtual ~MatrixElementKernelBase() {} + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + virtual int computeGoodHelicities() = 0; + + // Compute matrix elements + virtual void computeMatrixElements( const unsigned int channelId ) = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the input momenta + const BufferMomenta& m_momenta; + + // The buffer for the gs to calculate the alphaS values + const BufferGs& m_gs; + + // The buffer for the random numbers for helicity selection + const BufferRndNumHelicity& m_rndhel; + + // The buffer for the random numbers for color selection + const BufferRndNumColor& m_rndcol; + + // The buffer for the output matrix elements + BufferMatrixElements& m_matrixElements; + + // The buffer for the output helicity selection + BufferSelectedHelicity& m_selhel; + + // The buffer for the output color selection + BufferSelectedColor& m_selcol; + }; + + //-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A class encapsulating matrix element calculations on a CPU host + class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelHost( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t nevt ); + + // Destructor + virtual ~MatrixElementKernelHost() {} + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const unsigned int channelId ) override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + + // Does this host system support the SIMD used in the matrix element calculation? + // [NB: SIMD vectorization in mg5amc C++ code is currently only used in the ME calculations below MatrixElementKernelHost!] + static bool hostSupportsSIMD( const bool verbose = true ); + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + HostBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + HostBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + HostBufferDenominators m_denominators; +#endif + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating matrix element calculations on a GPU device + class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + MatrixElementKernelDevice( const BufferMomenta& momenta, // input: momenta + const BufferGs& gs, // input: gs for alphaS + const BufferRndNumHelicity& rndhel, // input: random numbers for helicity selection + const BufferRndNumColor& rndcol, // input: random numbers for color selection + BufferMatrixElements& matrixElements, // output: matrix elements + BufferSelectedHelicity& selhel, // output: helicity selection + BufferSelectedColor& selcol, // output: color selection + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~MatrixElementKernelDevice() {} + + // Reset gpublocks and gputhreads + void setGrid( const int gpublocks, const int gputhreads ); + + // Compute good helicities (returns nGoodHel, the number of good helicity combinations out of ncomb) + int computeGoodHelicities() override final; + + // Compute matrix elements + void computeMatrixElements( const 
unsigned int channelId ) override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The buffer for the event-by-event couplings that depends on alphas QCD + DeviceBufferCouplings m_couplings; + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // The buffer for the event-by-event numerators of multichannel factors + DeviceBufferNumerators m_numerators; + + // The buffer for the event-by-event denominators of multichannel factors + DeviceBufferDenominators m_denominators; +#endif + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // MATRIXELEMENTKERNELS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h new file mode 100644 index 0000000000..f3ab497b7a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -0,0 +1,150 @@ +#ifndef MemoryAccessAmplitudes_H +#define MemoryAccessAmplitudes_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_AMPLITUDES 1 + +//---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + +// A class describing the internal layout of memory buffers for amplitudes +// This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessAmplitudesBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the amplitude AOSOA memory buffer layout + static constexpr int neppA = 1; // AOS (just a test...) 
+ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagA = ievt / neppA; // #event "A-page" + const int ieppA = ievt % neppA; // #event in the current event A-page + constexpr int ix2 = 0; + return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagA = 0; + constexpr int ieppA = 0; + return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature 
(non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +#endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessAmplitudes +{ +public: + +#ifndef MGONGPU_TRIVIAL_AMPLITUDES + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef 
MGONGPU_TRIVIAL_AMPLITUDES +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessAmplitudes HostAccessAmplitudes; +typedef KernelAccessAmplitudes DeviceAccessAmplitudes; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessAmplitudes_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h new file mode 100644 index 0000000000..141d24ec71 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -0,0 +1,256 @@ +#ifndef MemoryAccessCouplings_H +#define MemoryAccessCouplings_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessMomenta.h" // for MemoryAccessMomentaBase::neppM +#include "MemoryBuffers.h" // for HostBufferCouplings::isaligned + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for couplings +// This implementation uses an AOSOA[npagC][ndcoup][nx2][neppC] "super-buffer" where nevt=npagC*neppC +// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessCouplingsBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the coupling AOSOA memory buffer layout + static constexpr int neppC = MemoryAccessMomentaBase::neppM; // use the same AOSOA striding as for momenta + + // SANITY CHECK: check that neppC is a power of two + static_assert( ispoweroftwo( neppC ), "neppC is not a power of 2" ); + + //-------------------------------------------------------------------------- + // ** NB! 
A single super-buffer AOSOA[npagC][ndcoup][nx2][neppC] includes data for ndcoup different couplings ** + // ** NB! The ieventAccessRecord and kernelAccess functions refer to the buffer for one individual coupling ** + // ** NB! Use idcoupAccessBuffer to add a fixed offset and locate the buffer for one given individual coupling ** + //-------------------------------------------------------------------------- + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (non-const) ===> fptype* idcoupAccessBuffer( fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? + static __host__ __device__ inline fptype* + idcoupAccessBuffer( fptype* buffer, // input "super-buffer" + const int idcoup ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + constexpr int ix2 = 0; + // NB! this effectively adds an offset "idcoup * nx2 * neppC" + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* idcoupAccessBufferConst( const fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? 
+ static __host__ __device__ inline const fptype* + idcoupAccessBufferConst( const fptype* buffer, // input "super-buffer" + const int idcoup ) + { + return idcoupAccessBuffer( const_cast( buffer ), idcoup ); + } + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of couplings that dependent on the running alphas QCD in this specific process + static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagC = ievt / neppC; // #event "C-page" + const int ieppC = ievt % neppC; // #event in the current event C-page + constexpr int idcoup = 0; + constexpr int ix2 = 0; + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + // NB! the offset "idcoup * nx2 * neppC" has been added in idcoupAccessBuffer + constexpr int idcoup = 0; + return buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC]; // AOSOA[ipagC][idcoup][ix2][ieppC] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessCouplings : public MemoryAccessCouplingsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + 
// Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessCouplings +{ +public: + + // Expose selected functions from MemoryAccessCouplingsBase + static constexpr auto idcoupAccessBuffer = MemoryAccessCouplingsBase::idcoupAccessBuffer; + static constexpr auto idcoupAccessBufferConst = MemoryAccessCouplingsBase::idcoupAccessBufferConst; + + // Expose selected functions from MemoryAccessCouplings + static constexpr auto ieventAccessRecordConst = MemoryAccessCouplings::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2_s = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& 
kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const_s = + KernelAccessHelper::template kernelAccessFieldConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccessIx2( fptype* buffer, + const int ix2 ) + { + fptype& out = kernelAccessIx2_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + return kernelAccessIx2( const_cast( buffer ), ix2 ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + const fptype& out = kernelAccessIx2Const_s( buffer, ix2 ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> cxtype_sv_ref kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv_ref + kernelAccess( fptype* buffer ) + { + /* + fptype_sv& real = kernelAccessIx2( buffer, 0 ); + fptype_sv& imag = kernelAccessIx2( buffer, 1 ); + printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv_ref( real, imag ); + */ + return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), + kernelAccessIx2( buffer, 1 ) ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + /* + const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); + const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); + printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv( real, imag ); + */ + return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), + kernelAccessIx2Const( buffer, 1 ) ); + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessCouplings HostAccessCouplings; +typedef KernelAccessCouplings DeviceAccessCouplings; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessCouplings_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h new file mode 100644 index 0000000000..0f9850baf2 --- 
/dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -0,0 +1,70 @@ +#ifndef MemoryAccessCouplingsFixed_H +#define MemoryAccessCouplingsFixed_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +//#include "MemoryAccessHelpers.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for fixed couplings +// This implementation uses a STRUCT[ndcoup][nx2] "super-buffer" layout: in practice, the cIPC global array +// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling +// [If many implementations are used, a suffix _Sv1 should be appended to the class name] +class MemoryAccessCouplingsFixedBase //_Sv1 +{ +public: + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* iicoupAccessBufferConst( const fptype* buffer, const int iicoup ) <===] + static __host__ __device__ inline const fptype* + iicoupAccessBufferConst( const fptype* buffer, // input "super-buffer": in practice, the cIPC global array + const int iicoup ) + { + constexpr int ix2 = 0; + // NB! 
this effectively adds an offset "iicoup * nx2" + return &( buffer[iicoup * nx2 + ix2] ); // STRUCT[idcoup][ix2] + } + +private: + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessCouplingsFixed +{ +public: + + // Expose selected functions from MemoryAccessCouplingsFixedBase + static constexpr auto iicoupAccessBufferConst = MemoryAccessCouplingsFixedBase::iicoupAccessBufferConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline const cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + // TRIVIAL ACCESS to fixed-couplings buffers! + //return cxmake( fptype_sv{ buffer[0] }, fptype_sv{ buffer[1] } ); // NO! BUG #339! 
+ const fptype_sv r_sv = fptype_sv{ 0 } + buffer[0]; + const fptype_sv i_sv = fptype_sv{ 0 } + buffer[1]; + return cxmake( r_sv, i_sv ); // ugly but effective + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessCouplingsFixed HostAccessCouplingsFixed; +typedef KernelAccessCouplingsFixed DeviceAccessCouplingsFixed; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessCouplingsFixed_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h new file mode 100644 index 0000000000..7a4a80ebd9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -0,0 +1,18 @@ +#ifndef MemoryAccessDenominators_H +#define MemoryAccessDenominators_H 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + +#include "MemoryAccessGs.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for denominators +// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + +typedef KernelAccessGs HostAccessDenominators; +typedef KernelAccessGs DeviceAccessDenominators; + +//---------------------------------------------------------------------------- + +#endif +#endif // MemoryAccessDenominators_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h new file mode 100644 index 0000000000..f233d64b9c --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -0,0 +1,148 @@ +#ifndef MemoryAccessGs_H +#define MemoryAccessGs_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + 
+//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for Gs +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessGsBase //_ARRAYv1 +{ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessGs : public MemoryAccessGsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template 
ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessGs +{ +public: + + // Expose selected functions from MemoryAccessGs + static constexpr auto ieventAccessRecord = MemoryAccessGs::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst_s = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessConst( const fptype* buffer ) + { + const fptype& out = kernelAccessConst_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessGs HostAccessGs; +typedef KernelAccessGs DeviceAccessGs; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessGs_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h new file mode 100644 index 0000000000..aa3016c9a1 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -0,0 +1,152 @@ +#ifndef MemoryAccessHelpers_H +#define MemoryAccessHelpers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for MemoryAccess classes +template +class MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = T::ieventAccessRecord; + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline const fptype* + ieventAccessRecordConst( const fptype* buffer, + const int ievt ) + { + return ieventAccessRecord( const_cast( buffer ), ievt ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record 
(input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + static constexpr auto decodeRecord = T::decodeRecord; + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline const fptype& + decodeRecordConst( const fptype* buffer, + Ts... args ) // variadic template + { + return T::decodeRecord( const_cast( buffer ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessField( fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline fptype& + ieventAccessField( fptype* buffer, + const int ievt, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( T::ieventAccessRecord( buffer, ievt ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessFieldConst( const fptype* buffer, const ievt, Ts... args ) <===] + template + static __host__ __device__ inline const fptype& + ieventAccessFieldConst( const fptype* buffer, + const int ievt, + Ts... 
args ) // variadic template + { + return ieventAccessField( const_cast( buffer ), ievt, args... ); + } +}; + +//---------------------------------------------------------------------------- + +// A templated helper class that includes the boilerplate code for KernelAccess classes +template +class KernelAccessHelper : public MemoryAccessHelper +{ +public: + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const) ===> fptype* kernelAccessRecord( fptype* buffer ) <===] + static __host__ __device__ inline fptype* + kernelAccessRecord( fptype* buffer ) + { + if constexpr( !onDevice ) // requires c++17 also in CUDA (#333) + { + // FIXME #436: clarify that buffer includes all events on device, and only the record for an event subset on host! + // FIXME #436: am I not assuming that the following line is always identical to buffer for all access classes T? 
+ return T::ieventAccessRecord( buffer, 0 ); + } + else + { +#ifdef __CUDACC__ + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); + return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA +#else + throw std::runtime_error( "kernelAccessRecord on device is only implemented in CUDA" ); +#endif + } + } + + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const) ===> const fptype* kernelAccessRecordConst( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype* + kernelAccessRecordConst( const fptype* buffer ) + { + return kernelAccessRecord( const_cast( buffer ) ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessField( fptype* buffer, Ts... args ) <===] + template + static __host__ __device__ inline fptype& + kernelAccessField( fptype* buffer, + Ts... args ) // variadic template + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + return T::decodeRecord( kernelAccessRecord( buffer ), args... ); + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessFieldConst( const fptype* buffer, Ts... 
args ) <===] + template + static __host__ __device__ inline const fptype& + kernelAccessFieldConst( const fptype* buffer, + Ts... args ) // variadic template + { + return kernelAccessField( const_cast( buffer ), args... ); + } + + //-------------------------------------------------------------------------- +}; + +#endif // MemoryAccessHelpers_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h new file mode 100644 index 0000000000..05f0810807 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -0,0 +1,132 @@ +#ifndef MemoryAccessMatrixElements_H +#define MemoryAccessMatrixElements_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" +#include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for matrix elements +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessMatrixElementsBase //_ARRAYv1 +{ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ 
__device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessMatrixElements : public MemoryAccessMatrixElementsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) 
===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessMatrixElements +{ +public: + + // Expose selected functions from MemoryAccessMatrixElements + static constexpr auto ieventAccessRecord = MemoryAccessMatrixElements::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess_s( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = 
kernelAccess_s( buffer ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferMatrixElements::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast +#endif + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessMatrixElements HostAccessMatrixElements; +typedef KernelAccessMatrixElements DeviceAccessMatrixElements; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessMatrixElements_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h new file mode 100644 index 0000000000..ace50b40e8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -0,0 +1,260 @@ +#ifndef MemoryAccessMomenta_H +#define MemoryAccessMomenta_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" +#include "MemoryAccessVectors.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for momenta +// This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM +// [If many implementations are used, a 
suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessMomentaBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef __CUDACC__ /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) +#else + // ----------------------------------------------------------------------------------------------- + // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register + // --- This is relevant to ensure faster access to momenta from C++ memory cache lines + // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded + // --- In practice, neppR, neppM and neppV could now (in principle) all be different + // ----------------------------------------------------------------------------------------------- +#ifdef MGONGPU_CPPSIMD + static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance + //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) + //static constexpr int neppM = 
32/sizeof(fptype); // lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) + //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS +#else + static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) +#endif +#endif /* clang-format on */ + + // SANITY CHECK: check that neppM is a power of two + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = mgOnGpu::np4; + + // The number of particles in this physics process + static constexpr int npar = mgOnGpu::npar; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagM = ievt / neppM; // #event "M-page" + const int ieppM = ievt % neppM; // #event in the current event M-page + constexpr int ip4 = 0; + constexpr int ipar = 0; + return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record 
(input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int ipar ) + { + constexpr int ipagM = 0; + constexpr int ieppM = 0; + return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessMomenta : public MemoryAccessMomentaBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ip4, const int ipar ) <===] + static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const
int ip4, const int ipar ) <===] + static constexpr auto decodeRecordIp4IparConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + static constexpr auto ieventAccessIp4Ipar = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto ieventAccessIp4IparConst = + MemoryAccessHelper::template ieventAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const int ievt, const int ip4, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + ieventAccessIp4IparConst( const fptype* buffer, + const int ievt, + const int ip4, + const int ipar ) + { + const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); + return out; + } + */ +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessMomenta +{ +public: + + // Expose selected functions from MemoryAccessMomenta + static constexpr auto
ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ip4, const int ipar ) <===] + static constexpr auto kernelAccessIp4Ipar = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto kernelAccessIp4IparConst_s = + KernelAccessHelper::template kernelAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + kernelAccessIp4IparConst_s( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); + return out; + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) <===] + // FIXME? Eventually return by const reference and support aligned arrays only? + // FIXME?
Currently return by value to support also unaligned and arbitrary arrays + static __host__ __device__ inline fptype_sv + kernelAccessIp4IparConst( const fptype* buffer, + const int ip4, + const int ipar ) + { + const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); +#ifndef MGONGPU_CPPSIMD + return out; +#else + constexpr int neppM = MemoryAccessMomentaBase::neppM; + constexpr bool useContiguousEventsIfPossible = true; // DEFAULT + //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) + // Use c++17 "if constexpr": compile-time branching + if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) + { + //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) + constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] + if constexpr( skipAlignmentCheck ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) + // FASTEST? (5.09E6 in eemumu 512y) + // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! + return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast + } + else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) + // DEFAULT! A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) + // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! 
AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) + // A bit (1%) slower (5.05E6 in eemumu 512y) + // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 + return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + } + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) + // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) + // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. can be used with AOS neppM==1) + constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV + auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) + -> const fptype& + { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; + return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) + } +#endif + } + + // Is this a HostAccess or DeviceAccess class? 
+ // [this is only needed for a warning printout in rambo.h for nparf==1 #358] + static __host__ __device__ inline constexpr bool + isOnDevice() + { + return onDevice; + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessMomenta HostAccessMomenta; +typedef KernelAccessMomenta DeviceAccessMomenta; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessMomenta_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h new file mode 100644 index 0000000000..e5f81381a9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -0,0 +1,18 @@ +#ifndef MemoryAccessNumerators_H +#define MemoryAccessNumerators_H 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + +#include "MemoryAccessGs.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for numerators +// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + +typedef KernelAccessGs HostAccessNumerators; +typedef KernelAccessGs DeviceAccessNumerators; + +//---------------------------------------------------------------------------- + +#endif +#endif // MemoryAccessNumerators_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h new file mode 100644 index 0000000000..a7ff24243f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -0,0 +1,132 @@ +#ifndef MemoryAccessRandomNumbers_H +#define MemoryAccessRandomNumbers_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for random 
numbers +// This implementation uses an AOSOA[npagR][nparf][np4][neppR] where nevt=npagR*neppR +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessRandomNumbersBase //_AOSOAv1 +{ +public: /* clang-format off */ + + // Number of Events Per Page in the random number AOSOA memory buffer layout + // *** NB Different values of neppR lead to different physics results: the *** + // *** same 1d array is generated, but it is interpreted in different ways *** + static constexpr int neppR = 8; // HARDCODED TO GIVE ALWAYS THE SAME PHYSICS RESULTS! + //static constexpr int neppR = 1; // AOS (tests of sectors/requests) + +private: /* clang-format on */ + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a 4-momentum + static constexpr int np4 = mgOnGpu::np4; + + // The number of final state particles in this physics process + static constexpr int nparf = mgOnGpu::nparf; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagR = ievt / neppR; // #event "R-page" + const int ieppR = ievt % neppR; // #event in the current event R-page + constexpr int ip4 = 0; + constexpr int iparf = 0; + return &( buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR] ); // 
AOSOA[ipagR][iparf][ip4][ieppR] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ip4, const int iparf" and rename "Field" as "Ip4Iparf"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int iparf ) + { + constexpr int ipagR = 0; + constexpr int ieppR = 0; + return buffer[ipagR * nparf * np4 * neppR + iparf * np4 * neppR + ip4 * neppR + ieppR]; // AOSOA[ipagR][iparf][ip4][ieppR] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessRandomNumbers : public MemoryAccessRandomNumbersBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto decodeRecordIp4Iparf =
MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto decodeRecordIp4IparfConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Iparf( fptype* buffer, const int ievt, const int ip4, const int iparf ) <===] + static constexpr auto ieventAccessIp4Iparf = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparfConst( const fptype* buffer, const int ievt, const int ip4, const int iparf ) <===] + static constexpr auto ieventAccessIp4IparfConst = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessRandomNumbers +{ +public: + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIp4Iparf( fptype* buffer, const int ip4, const int iparf ) <===] + static constexpr auto kernelAccessIp4Iparf = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const
fptype& kernelAccessIp4IparfConst( const fptype* buffer, const int ipar, const int iparf ) <===] + static constexpr auto kernelAccessIp4IparfConst = + KernelAccessHelper::template kernelAccessFieldConst; +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessRandomNumbers HostAccessRandomNumbers; +typedef KernelAccessRandomNumbers DeviceAccessRandomNumbers; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessRandomNumbers_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h new file mode 100644 index 0000000000..2697cdad52 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -0,0 +1,122 @@ +#ifndef MemoryAccessVectors_H +#define MemoryAccessVectors_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#ifndef __CUDACC__ +namespace mg5amcCpu // this is only needed for CPU SIMD vectorization +{ + +#ifdef MGONGPU_CPPSIMD + //-------------------------------------------------------------------------- + + // Cast one non-const fptype_v reference (one vector of neppV fptype values) from one non-const fptype reference (#435), + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline fptype_v& fptypevFromAlignedArray( fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + // Cast one const fptype_v reference (one vector of neppV fptype values) from one const fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", and that the arrays are aligned + inline const fptype_v& fptypevFromAlignedArray( const fptype& ref ) + { + return *reinterpret_cast( &ref ); + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // assuming that "pointer(evt#0)+1" indicates "pointer(evt#1)", but that the arrays are not aligned + 
inline fptype_v fptypevFromUnalignedArray( const fptype& ref ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (2) + *( &ref + 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (4) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (8) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ *( &ref ), // explicit initialization of all array elements (16) + *( &ref + 1 ), + *( &ref + 2 ), + *( &ref + 3 ), + *( &ref + 4 ), + *( &ref + 5 ), + *( &ref + 6 ), + *( &ref + 7 ), + *( &ref + 8 ), + *( &ref + 9 ), + *( &ref + 10 ), + *( &ref + 11 ), + *( &ref + 12 ), + *( &ref + 13 ), + *( &ref + 14 ), + *( &ref + 15 ) }; +#else +#error Internal error! Unknown MGONGPU_CPPSIMD value +#endif + } + + // Build one fptype_v (one vector of neppV fptype values) from one fptype reference, + // with no a priori assumption on how the input fptype array should be decoded + template + inline fptype_v fptypevFromArbitraryArray( Functor decoderIeppv ) + { +#if MGONGPU_CPPSIMD == 2 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (2) + decoderIeppv( 1 ) }; +#elif MGONGPU_CPPSIMD == 4 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (4) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ) }; +#elif MGONGPU_CPPSIMD == 8 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array elements (8) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ) }; +#elif MGONGPU_CPPSIMD == 16 + return fptype_v{ decoderIeppv( 0 ), // explicit initialization of all array 
elements (16) + decoderIeppv( 1 ), + decoderIeppv( 2 ), + decoderIeppv( 3 ), + decoderIeppv( 4 ), + decoderIeppv( 5 ), + decoderIeppv( 6 ), + decoderIeppv( 7 ), + decoderIeppv( 8 ), + decoderIeppv( 9 ), + decoderIeppv( 10 ), + decoderIeppv( 11 ), + decoderIeppv( 12 ), + decoderIeppv( 13 ), + decoderIeppv( 14 ), + decoderIeppv( 15 ) }; +#else +#error Internal error! Unknown MGONGPU_CPPSIMD value +#endif + } + + //-------------------------------------------------------------------------- +#endif + +} // end namespace +#endif + +#endif // MemoryAccessVectors_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h new file mode 100644 index 0000000000..738eef9a02 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -0,0 +1,155 @@ +#ifndef MemoryAccessWavefunctions_H +#define MemoryAccessWavefunctions_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "MemoryAccessHelpers.h" + +#define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 + +//---------------------------------------------------------------------------- + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + +// A class describing the internal layout of memory buffers for wavefunctions +// This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW +// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] +class MemoryAccessWavefunctionsBase //_AOSOAv1 +{ +public: + + // Number of Events Per Page in the wavefunction AOSOA memory buffer layout + static constexpr int neppW = 1; // AOS (just a test...) 
+ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a (fermion or vector) wavefunction + static constexpr int nw6 = mgOnGpu::nw6; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagW = ievt / neppW; // #event "W-page" + const int ieppW = ievt % neppW; // #event in the current event W-page + constexpr int iw6 = 0; + constexpr int ix2 = 0; + return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int iw6, + const int ix2 ) + { + constexpr int ipagW = 0; + constexpr int ieppW = 0; + return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory 
buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2Const = + MemoryAccessHelper::template ieventAccessFieldConst; +}; + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template +class KernelAccessWavefunctions +{ +public: + +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2 = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2Const = + KernelAccessHelper::template kernelAccessFieldConst; + +#else + + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer 
); + } + + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } + +#endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessWavefunctions HostAccessWavefunctions; +typedef KernelAccessWavefunctions DeviceAccessWavefunctions; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h new file mode 100644 index 0000000000..3915657657 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h @@ -0,0 +1,135 @@ +#ifndef MemoryAccessWeights_H +#define MemoryAccessWeights_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryAccessHelpers.h" + +//---------------------------------------------------------------------------- + +// A class describing the internal layout of memory buffers for weights +// This implementation uses a plain ARRAY[nevt] +// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] +class MemoryAccessWeightsBase //_ARRAYv1 +{ +private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const 
int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on explicit event numbers +// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations +class MemoryAccessWeights : public MemoryAccessWeightsBase +{ +public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper<MemoryAccessWeightsBase>::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper<MemoryAccessWeightsBase>::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper<MemoryAccessWeightsBase>::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + //
[Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper<MemoryAccessWeightsBase>::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper<MemoryAccessWeightsBase>::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper<MemoryAccessWeightsBase>::template ieventAccessFieldConst<>; +}; + +//---------------------------------------------------------------------------- + +// A class providing access to memory buffers for a given event, based on implicit kernel rules +// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations +template<bool onDevice> +class KernelAccessWeights +{ +public: + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccess = + KernelAccessHelper<MemoryAccessWeightsBase, onDevice>::template kernelAccessField<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccess( fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline fptype& + kernelAccess( fptype* buffer ) + { + return KernelAccessHelper<MemoryAccessWeightsBase, onDevice>::template kernelAccessField<>( buffer ); + } + + /* + // Locate a field (output) in
a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // FINAL IMPLEMENTATION FOR CUDA 11.4 + static constexpr auto kernelAccessConst = + KernelAccessHelper<MemoryAccessWeightsBase, onDevice>::template kernelAccessFieldConst<>; + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + // TEMPORARY HACK FOR CUDA 11.1 + static __host__ __device__ inline const fptype& + kernelAccessConst( const fptype* buffer ) + { + return KernelAccessHelper<MemoryAccessWeightsBase, onDevice>::template kernelAccessFieldConst<>( buffer ); + } +}; + +//---------------------------------------------------------------------------- + +typedef KernelAccessWeights<false> HostAccessWeights; +typedef KernelAccessWeights<true> DeviceAccessWeights; + +//---------------------------------------------------------------------------- + +#endif // MemoryAccessWeights_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h new file mode 100644 index 0000000000..5775c59793 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -0,0 +1,530 @@ +#ifndef MemoryBuffers_H +#define MemoryBuffers_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" + +#include "CudaRuntime.h" +#include "Parameters_MSSM_SLHA2.h" + +#include <sstream> + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // TEMPORARY? Take this from a PhysicsProcess class? Define them here directly in codegen?
+ namespace MemoryBuffers + { + static constexpr size_t np4 = mgOnGpu::np4; + static constexpr size_t nparf = mgOnGpu::nparf; + static constexpr size_t npar = mgOnGpu::npar; + static constexpr size_t nw6 = mgOnGpu::nw6; + static constexpr size_t nx2 = mgOnGpu::nx2; + static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + } + + //-------------------------------------------------------------------------- + + // An abstract interface encapsulating a given number of events + class INumberOfEvents + { + public: + virtual ~INumberOfEvents() {} + virtual size_t nevt() const = 0; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating a given number of events + class NumberOfEvents : virtual public INumberOfEvents + { + public: + NumberOfEvents( const size_t nevt ) + : m_nevt( nevt ) {} + virtual ~NumberOfEvents() {} + virtual size_t nevt() const override { return m_nevt; } + private: + const size_t m_nevt; + }; + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer (not necessarily an event buffer) + template + class BufferBase : virtual public INumberOfEvents + { + protected: + BufferBase( const size_t size, const bool onDevice ) + : m_size( size ), m_data( nullptr ), m_isOnDevice( onDevice ) {} + virtual ~BufferBase() {} + public: + T* data() { return m_data; } + const T* data() const { return m_data; } + T& operator[]( const size_t index ) { return m_data[index]; } + const T& operator[]( const size_t index ) const { return m_data[index]; } + size_t size() const { return m_size; } + size_t bytes() const { return m_size * sizeof( T ); } + bool isOnDevice() const { return m_isOnDevice; } + virtual size_t nevt() const override { throw std::runtime_error( "This BufferBase is not an event buffer" ); } + protected: + const size_t m_size; + T* m_data; + const bool m_isOnDevice; + }; + + 
//-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + constexpr bool HostBufferALIGNED = false; // ismisaligned=false + constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true + + // A class encapsulating a C++ host buffer + template + class HostBufferBase : public BufferBase + { + public: + HostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + if constexpr( !ismisaligned ) + this->m_data = new( std::align_val_t( cppAlign ) ) T[size](); + else + this->m_data = new( std::align_val_t( cppAlign ) ) T[size + 1]() + 1; // TEST MISALIGNMENT! + } + virtual ~HostBufferBase() + { + if constexpr( !ismisaligned ) + ::operator delete[]( this->m_data, std::align_val_t( cppAlign ) ); + else + ::operator delete[]( ( this->m_data ) - 1, std::align_val_t( cppAlign ) ); // TEST MISALIGNMENT! + } + static constexpr bool isaligned() { return !ismisaligned; } + public: + static constexpr size_t cppAlign = mgOnGpu::cppAlign; + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA pinned host buffer + template + class PinnedHostBufferBase : public BufferBase + { + public: + PinnedHostBufferBase( const size_t size ) + : BufferBase( size, false ) + { + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + } + virtual ~PinnedHostBufferBase() + { + checkCuda( cudaFreeHost( this->m_data ) ); + } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA device buffer + template + class DeviceBufferBase : public BufferBase + { + public: + DeviceBufferBase( const size_t size ) + : BufferBase( size, true ) + { + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + } + virtual ~DeviceBufferBase() + { + checkCuda( cudaFree( this->m_data ) ); + } + }; +#endif + + 
//-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for a given number of events + template + class HostBuffer : public HostBufferBase, virtual private NumberOfEvents + { + public: + HostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , HostBufferBase( sizePerEvent * nevt ) {} + virtual ~HostBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA pinned host buffer for a given number of events + template + class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents + { + public: + PinnedHostBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , PinnedHostBufferBase( sizePerEvent * nevt ) {} + virtual ~PinnedHostBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating a CUDA device buffer for a given number of events + template + class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents + { + public: + DeviceBuffer( const size_t nevt ) + : NumberOfEvents( nevt ) + , DeviceBufferBase( sizePerEvent * nevt ) {} + virtual ~DeviceBuffer() {} + virtual size_t nevt() const override final { return NumberOfEvents::nevt(); } + }; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta random numbers + typedef BufferBase BufferRndNumMomenta; + + // The size (number of elements) per event in a memory buffer for momenta random numbers + constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host 
buffer for momenta random numbers + typedef HostBuffer HostBufferRndNumMomenta; +#else + // A class encapsulating a CUDA pinned host buffer for momenta random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumMomenta; + // A class encapsulating a CUDA device buffer for momenta random numbers + typedef DeviceBuffer DeviceBufferRndNumMomenta; +#endif + + //-------------------------------------------------------------------------- + + /* + // A base class encapsulating a memory buffer with ONE fptype per event + typedef BufferBase BufferOneFp; + + // The size (number of elements) per event in a memory buffer with ONE fptype per event + constexpr size_t sizePerEventOneFp = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer with ONE fptype per event + typedef HostBuffer HostBufferOneFp; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferOneFp; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferOneFp; +#endif + + // Memory buffers for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferOneFp BufferGs; + typedef HostBufferOneFp HostBufferGs; + typedef PinnedHostBufferOneFp PinnedHostBufferGs; + typedef DeviceBufferOneFp DeviceBufferGs; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for Gs (related to the event-by-event strength of running coupling constant alphas QCD) + typedef BufferBase BufferGs; + + // The size (number of elements) per event in a memory buffer for Gs + constexpr size_t sizePerEventGs = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferGs; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferGs; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer 
DeviceBufferGs; +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for numerators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferNumerators; + + // The size (number of elements) per event in a memory buffer for numerators + constexpr size_t sizePerEventNumerators = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferNumerators; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferNumerators; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferNumerators; +#endif +#endif + + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // A base class encapsulating a memory buffer for denominators (of the multichannel single-diagram enhancement factors) + typedef BufferBase BufferDenominators; + + // The size (number of elements) per event in a memory buffer for denominators + constexpr size_t sizePerEventDenominators = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferDenominators; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferDenominators; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferDenominators; +#endif +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for couplings that depend on the event-by-event running coupling constant alphas QCD + typedef BufferBase BufferCouplings; + + // The size (number of elements) per event in a memory buffer for random numbers + constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; + 
+#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for gs + typedef HostBuffer HostBufferCouplings; +#else + // A class encapsulating a CUDA pinned host buffer for gs + typedef PinnedHostBuffer PinnedHostBufferCouplings; + // A class encapsulating a CUDA device buffer for gs + typedef DeviceBuffer DeviceBufferCouplings; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for momenta + typedef BufferBase BufferMomenta; + + // The size (number of elements) per event in a memory buffer for momenta + constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for momenta + typedef HostBuffer HostBufferMomenta; + //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! +#else + // A class encapsulating a CUDA pinned host buffer for momenta + typedef PinnedHostBuffer PinnedHostBufferMomenta; + // A class encapsulating a CUDA device buffer for momenta + typedef DeviceBuffer DeviceBufferMomenta; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for sampling weights + typedef BufferBase BufferWeights; + + // The size (number of elements) per event in a memory buffer for sampling weights + constexpr size_t sizePerEventWeights = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for sampling weights + typedef HostBuffer HostBufferWeights; +#else + // A class encapsulating a CUDA pinned host buffer for sampling weights + typedef PinnedHostBuffer PinnedHostBufferWeights; + // A class encapsulating a CUDA device buffer for sampling weights + typedef DeviceBuffer DeviceBufferWeights; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for matrix elements + typedef BufferBase BufferMatrixElements; + 
+ // The size (number of elements) per event in a memory buffer for matrix elements + constexpr size_t sizePerEventMatrixElements = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for matrix elements + typedef HostBuffer HostBufferMatrixElements; +#else + // A class encapsulating a CUDA pinned host buffer for matrix elements + typedef PinnedHostBuffer PinnedHostBufferMatrixElements; + // A class encapsulating a CUDA device buffer for matrix elements + typedef DeviceBuffer DeviceBufferMatrixElements; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for the helicity mask + typedef BufferBase BufferHelicityMask; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for the helicity mask + typedef HostBufferBase HostBufferHelicityMask; +#else + // A class encapsulating a CUDA pinned host buffer for the helicity mask + typedef PinnedHostBufferBase PinnedHostBufferHelicityMask; + // A class encapsulating a CUDA device buffer for the helicity mask + typedef DeviceBufferBase DeviceBufferHelicityMask; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for wavefunctions + typedef BufferBase BufferWavefunctions; + + // The size (number of elements) per event in a memory buffer for wavefunctions + constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for wavefunctions + typedef HostBuffer HostBufferWavefunctions; +#else + // A class encapsulating a CUDA pinned host buffer for wavefunctions + typedef PinnedHostBuffer PinnedHostBufferWavefunctions; + // A class encapsulating a CUDA device buffer for wavefunctions + typedef DeviceBuffer DeviceBufferWavefunctions; +#endif + + //-------------------------------------------------------------------------- + + // A base class 
encapsulating a memory buffer for helicity random numbers + typedef BufferBase BufferRndNumHelicity; + + // The size (number of elements) per event in a memory buffer for helicity random numbers + constexpr size_t sizePerEventRndNumHelicity = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for helicity random numbers + typedef HostBuffer HostBufferRndNumHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumHelicity; + // A class encapsulating a CUDA device buffer for helicity random numbers + typedef DeviceBuffer DeviceBufferRndNumHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color random numbers + typedef BufferBase BufferRndNumColor; + + // The size (number of elements) per event in a memory buffer for color random numbers + constexpr size_t sizePerEventRndNumColor = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for color random numbers + typedef HostBuffer HostBufferRndNumColor; +#else + // A class encapsulating a CUDA pinned host buffer for color random numbers + typedef PinnedHostBuffer PinnedHostBufferRndNumColor; + // A class encapsulating a CUDA device buffer for color random numbers + typedef DeviceBuffer DeviceBufferRndNumColor; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for helicity selection + typedef BufferBase BufferSelectedHelicity; + + // The size (number of elements) per event in a memory buffer for helicity selection + constexpr size_t sizePerEventSelectedHelicity = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for helicity selection + typedef HostBuffer HostBufferSelectedHelicity; +#else + // A class encapsulating a CUDA pinned host buffer for helicity selection + typedef 
PinnedHostBuffer PinnedHostBufferSelectedHelicity; + // A class encapsulating a CUDA device buffer for helicity selection + typedef DeviceBuffer DeviceBufferSelectedHelicity; +#endif + + //-------------------------------------------------------------------------- + + // A base class encapsulating a memory buffer for color selection + typedef BufferBase BufferSelectedColor; + + // The size (number of elements) per event in a memory buffer for color selection + constexpr size_t sizePerEventSelectedColor = 1; + +#ifndef __CUDACC__ + // A class encapsulating a C++ host buffer for color selection + typedef HostBuffer HostBufferSelectedColor; +#else + // A class encapsulating a CUDA pinned host buffer for color selection + typedef PinnedHostBuffer PinnedHostBufferSelectedColor; + // A class encapsulating a CUDA device buffer for color selection + typedef DeviceBuffer DeviceBufferSelectedColor; +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + template + void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy + { + if( dst.size() != src.size() ) + { + std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyDeviceFromHost: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyDeviceFromHost: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + template + void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in 
memcpy + { + if( dst.size() != src.size() ) + { + std::ostringstream sstr; + sstr << "Size (#elements) mismatch in copyHostFromDevice: dst=" << dst.size() << ", src=" << src.size(); + throw std::runtime_error( sstr.str() ); + } + if( dst.bytes() != src.bytes() ) + { + std::ostringstream sstr; + sstr << "Size (#bytes) mismatch in copyHostFromDevice: dst=" << dst.bytes() << ", src=" << src.bytes(); + throw std::runtime_error( sstr.str() ); + } + // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + + //-------------------------------------------------------------------------- +} + +#endif // MemoryBuffers_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/Bridge.h new file mode 120000 index 0000000000..7afe008f47 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/Bridge.h @@ -0,0 +1 @@ +../Bridge.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.cc new file mode 120000 index 0000000000..4c8697458f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.cc @@ -0,0 +1 @@ +../BridgeKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.h new file mode 120000 index 0000000000..f21b556a84 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/BridgeKernels.h @@ -0,0 +1 @@ +../BridgeKernels.h \ No newline at end of file diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt new file mode 100644 index 0000000000..4ac6c179d3 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt @@ -0,0 +1,24 @@ +get_filename_component(basename ${CMAKE_CURRENT_SOURCE_DIR} NAME) +string(TOLOWER ${basename} targadd) + +file(GLOB_RECURSE HEADERS "../*.h" CPPProcess.h) +set(SOURCES ../BridgeKernels.cc CPPProcess.cc ../CrossSectionKernels.cc + ../MatrixElementKernels.cc ../RamboSamplingKernels.cc + ../RandomNumberKernels.cc) + +set(libname mg5amc_cxx_${targadd}) +add_library(${libname} ${SOURCES} ${HEADERS}) +target_include_directories(${libname} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" + "${PROJECT_SOURCE_DIR}/src" + "${PROJECT_GITROOT_DIR}/tools") + +set(execname check_${targadd}.exe) +add_executable(${execname} check_sa.cc) +target_link_libraries(${execname} PUBLIC mg5amc_common ${libname}) +target_include_directories(${execname} PRIVATE "${PROJECT_SOURCE_DIR}/src") + +# some XCode specific stuff to make the executable run +set_property(TARGET ${libname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_GENERATE_SCHEME TRUE) +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_ARGUMENTS "--bridge" "8" "8" "32") +set_property(TARGET ${execname} PROPERTY XCODE_SCHEME_WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}") diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc new file mode 100644 index 0000000000..a32c83489a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -0,0 +1,1084 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone 
by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#include "CPPProcess.h" + +#include "mgOnGpuConfig.h" + +#include "CudaRuntime.h" +#include "HelAmps_MSSM_SLHA2.h" +#include "MemoryAccessAmplitudes.h" +#include "MemoryAccessCouplings.h" +#include "MemoryAccessCouplingsFixed.h" +#include "MemoryAccessGs.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#include "MemoryAccessDenominators.h" +#include "MemoryAccessNumerators.h" +#include "coloramps.h" +#endif + +#include +#include +#include +#include +#include + +// Test ncu metrics for CUDA thread divergence +#undef MGONGPU_TEST_DIVERGENCE +//#define MGONGPU_TEST_DIVERGENCE 1 + +//========================================================================== +// Class member functions for calculating the matrix elements for +// Process: g g > t t~ WEIGHTED<=2 @1 + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + using mgOnGpu::np4; // dimensions of 4-momenta (E,px,py,pz) + using mgOnGpu::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + using mgOnGpu::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + + using mgOnGpu::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) + + using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) + using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) + + // The number of colors + constexpr int ncolor = 2; + + // The number of SIMD vectors of events processed by calculate_wavefunction +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + constexpr int nParity = 2; +#else + constexpr int nParity = 1; +#endif + + // Physics parameters (masses, coupling, etc...) + // For CUDA performance, hardcoded constexpr's would be better: fewer registers and a tiny throughput increase + // However, physics parameters are user-defined through card files: use CUDA constant memory instead (issue #39) + // [NB if hardcoded parameters are used, it's better to define them here to avoid silent shadowing (issue #263)] +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; + __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 +#else +#ifdef __CUDACC__ + __device__ __constant__ fptype cIPD[2]; + __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 +#else + static fptype cIPD[2]; + static fptype* cIPC = nullptr; // unused as nicoup=0 +#endif +#endif + + // Helicity combinations (and filtering of "good" helicity combinations) +#ifdef __CUDACC__ + __device__ __constant__ short cHel[ncomb][npar]; + __device__ __constant__ int cNGoodHel; // FIXME: assume process.nprocesses == 1 for the moment (eventually cNGoodHel[nprocesses]?) + __device__ __constant__ int cGoodHel[ncomb]; +#else + static short cHel[ncomb][npar]; + static int cNGoodHel; // FIXME: assume process.nprocesses == 1 for the moment (eventually cNGoodHel[nprocesses]?) 
+ static int cGoodHel[ncomb]; +#endif + + //-------------------------------------------------------------------------- + + // Evaluate |M|^2 for each subprocess + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums over helicities) + // In CUDA, this device function computes the ME for a single event + // In C++, this function computes the ME for a single event "page" or SIMD vector (or for two in "mixed" precision mode, nParity=2) + __device__ INLINE void /* clang-format off */ + calculate_wavefunctions( int ihel, + const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 running_sum_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) +#ifndef __CUDACC__ + , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) +#endif + ) + //ALWAYS_INLINE // attributes are not permitted in a function definition + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; + using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events + using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events + using W_ACCESS = DeviceAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using A_ACCESS = DeviceAccessAmplitudes; // TRIVIAL ACCESS (no kernel 
splitting yet): buffer for one event + using CD_ACCESS = DeviceAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = DeviceAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = DeviceAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = DeviceAccessDenominators; // non-trivial access: buffer includes all events +#endif +#else + using namespace mg5amcCpu; + using M_ACCESS = HostAccessMomenta; // non-trivial access: buffer includes all events + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events + using W_ACCESS = HostAccessWavefunctions; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using A_ACCESS = HostAccessAmplitudes; // TRIVIAL ACCESS (no kernel splitting yet): buffer for one event + using CD_ACCESS = HostAccessCouplings; // non-trivial access (dependent couplings): buffer includes all events + using CI_ACCESS = HostAccessCouplingsFixed; // TRIVIAL access (independent couplings): buffer for one event +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events +#endif +#endif /* clang-format on */ + mgDebug( 0, __FUNCTION__ ); + //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); +#ifndef __CUDACC__ + //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); +#endif + + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) + // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] + // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need + // ** NB: to have large memory structurs 
for wavefunctions/amplitudes in all events (no kernel splitting yet)! + //MemoryBufferWavefunctions w_buffer[nwf]{ neppV }; + cxtype_sv w_sv[nwf][nw6]; // particle wavefunctions within Feynman diagrams (nw6 is often 6, the dimension of spin 1/2 or spin 1 particles) + cxtype_sv amp_sv[1]; // invariant amplitude for one given Feynman diagram + + // Proof of concept for using fptype* in the interface + fptype* w_fp[nwf]; + for( int iwf = 0; iwf < nwf; iwf++ ) w_fp[iwf] = reinterpret_cast<fptype*>( w_sv[iwf] ); + fptype* amp_fp; + amp_fp = reinterpret_cast<fptype*>( amp_sv ); + + // Local variables for the given CUDA event (ievt) or C++ event page (ipagV) + // [jamp: sum (for one event or event page) of the invariant amplitudes for all Feynman diagrams in a given color combination] + cxtype_sv jamp_sv[ncolor] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxtype is NOT, if "= {}" is missing!) + + // === Calculate wavefunctions and amplitudes for all diagrams in all processes === + // === (for one event in CUDA, for one - or two in mixed mode - SIMD event pages in C++ === +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed fptypes #537: float for color algebra and double elsewhere + // Delay color algebra and ME updates (only on even pages) + cxtype_sv jamp_sv_previous[ncolor] = {}; + fptype* MEs_previous = 0; +#endif + for( int iParity = 0; iParity < nParity; ++iParity ) + { // START LOOP ON IPARITY +#ifndef __CUDACC__ + const int ievt0 = ievt00 + iParity * neppV; +#endif + constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings + const fptype* allCOUPs[nxcoup]; +#ifdef __CUDACC__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 186 // e.g. 
<> +#endif + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event + for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) + allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef __CUDACC__ +#pragma nv_diagnostic pop + // CUDA kernels take input/output buffers with momenta/MEs for all events + const fptype* momenta = allmomenta; + const fptype* COUPs[nxcoup]; + for( size_t ixcoup = 0; ixcoup < nxcoup; ixcoup++ ) COUPs[ixcoup] = allCOUPs[ixcoup]; + fptype* MEs = allMEs; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = allNumerators; + fptype* denominators = allDenominators; +#endif +#else + // C++ kernels take input/output buffers with momenta/MEs for one specific event (the first in the current event page) + const fptype* momenta = M_ACCESS::ieventAccessRecordConst( allmomenta, ievt0 ); + const fptype* COUPs[nxcoup]; + for( size_t idcoup = 0; idcoup < ndcoup; idcoup++ ) + COUPs[idcoup] = CD_ACCESS::ieventAccessRecordConst( allCOUPs[idcoup], ievt0 ); // dependent couplings, vary event-by-event + for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) + COUPs[ndcoup + iicoup] = allCOUPs[ndcoup + iicoup]; // independent couplings, fixed for all events + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); +#endif +#endif + + // Reset color flows (reset jamp_sv) at the beginning of a new event or event page + for( int i = 0; i < ncolor; i++ ) { jamp_sv[i] = cxzero_sv(); } + +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Numerators and denominators for the current event (CUDA) or SIMD event page (C++) + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( 
numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); +#endif + + // *** DIAGRAM 1 OF 3 *** + + // Wavefunction(s) for diagram number 1 + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][0], -1, w_fp[0], 0 ); + + vxxxxx<M_ACCESS, W_ACCESS>( momenta, 0., cHel[ihel][1], -1, w_fp[1], 1 ); + + oxxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][2], +1, w_fp[2], 2 ); + + ixxxxx<M_ACCESS, W_ACCESS>( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); + + VVV1P0_1<W_ACCESS, CD_ACCESS>( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[4] ); + + // Amplitude(s) for diagram number 1 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[2], w_fp[4], -COUPs[1], &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] += cxtype( 0, 1 ) * amp_sv[0]; + jamp_sv[1] -= cxtype( 0, 1 ) * amp_sv[0]; + + // *** DIAGRAM 2 OF 3 *** + + // Wavefunction(s) for diagram number 2 + FFV1_1<W_ACCESS, CD_ACCESS>( w_fp[2], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + + // Amplitude(s) for diagram number 2 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[3], w_fp[4], w_fp[1], -COUPs[1], &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[0] -= amp_sv[0]; + + // *** DIAGRAM 3 OF 3 *** + + // Wavefunction(s) for diagram number 3 + FFV1_2<W_ACCESS, CD_ACCESS>( w_fp[3], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + + // Amplitude(s) for diagram number 3 + FFV1_0<W_ACCESS, A_ACCESS, CD_ACCESS>( w_fp[4], w_fp[2], w_fp[1], -COUPs[1], &amp_fp[0] ); +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) +#endif + jamp_sv[1] -= amp_sv[0]; + + // *** COLOR CHOICE BELOW *** + // Store the leading color flows for choice of color + if( jamp2_sv ) // disable color choice if nullptr + for( int icolC = 0; icolC < ncolor; icolC++ ) + jamp2_sv[ncolor * iParity + icolC] += cxabs2( jamp_sv[icolC] ); + + // *** COLOR MATRIX BELOW *** + // (This method used to be called 
CPPProcess::matrix_1_gg_ttx()?) + + // The color denominators (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 denom[ncolor] = { 3, 3 }; // 1-D array[2] + + // The color matrix (initialize all array elements, with ncolor=2) + // [NB do keep 'static' for these constexpr arrays, see issue #283] + static constexpr fptype2 cf[ncolor][ncolor] = { + { 16, -2 }, + { -2, 16 } }; // 2-D array[2][2] + +#ifndef __CUDACC__ + // Pre-compute a constexpr triangular color matrix properly normalized #475 + struct TriangularNormalizedColorMatrix + { + // See https://stackoverflow.com/a/34465458 + __host__ __device__ constexpr TriangularNormalizedColorMatrix() + : value() + { + for( int icol = 0; icol < ncolor; icol++ ) + { + // Diagonal terms + value[icol][icol] = cf[icol][icol] / denom[icol]; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + value[icol][jcol] = 2 * cf[icol][jcol] / denom[icol]; + } + } + fptype2 value[ncolor][ncolor]; + }; + static constexpr auto cf2 = TriangularNormalizedColorMatrix(); +#endif + +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + if( iParity == 0 ) // NB: first page is 0! 
skip even pages, compute on odd pages + { + // Mixed fptypes: delay color algebra and ME updates to next (odd) ipagV + for( int icol = 0; icol < ncolor; icol++ ) + jamp_sv_previous[icol] = jamp_sv[icol]; + MEs_previous = MEs; + continue; // go to next iParity in the loop: skip color algebra and ME update on odd pages + } + fptype_sv deltaMEs_previous = { 0 }; +#endif + + // Sum and square the color flows to get the matrix element + // (compute |M|^2 by squaring |M|, taking into account colours) + // Sum and square the color flows to get the matrix element + // (compute |M|^2 by squaring |M|, taking into account colours) + fptype_sv deltaMEs = { 0 }; // all zeros https://en.cppreference.com/w/c/language/array_initialization#Notes + + // Use the property that M is a real matrix (see #475): + // we can rewrite the quadratic form (A-iB)(M)(A+iB) as AMA - iBMA + iBMA + BMB = AMA + BMB + // In addition, on C++ use the property that M is symmetric (see #475), + // and also use constexpr to compute "2*" and "/denom[icol]" once and for all at compile time: + // we gain (not a factor 2...) in speed here as we only loop over the up diagonal part of the matrix. + // Strangely, CUDA is slower instead, so keep the old implementation for the moment. 
+#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv jampR_sv[ncolor] = { 0 }; + fptype2_sv jampI_sv[ncolor] = { 0 }; + for( int icol = 0; icol < ncolor; icol++ ) + { + jampR_sv[icol] = fpvmerge( cxreal( jamp_sv_previous[icol] ), cxreal( jamp_sv[icol] ) ); + jampI_sv[icol] = fpvmerge( cximag( jamp_sv_previous[icol] ), cximag( jamp_sv[icol] ) ); + } +#endif + for( int icol = 0; icol < ncolor; icol++ ) + { +#ifndef __CUDACC__ + // === C++ START === + // Diagonal terms +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRi_sv = jampR_sv[icol]; + fptype2_sv& jampIi_sv = jampI_sv[icol]; +#else + fptype2_sv jampRi_sv = (fptype2_sv)( cxreal( jamp_sv[icol] ) ); + fptype2_sv jampIi_sv = (fptype2_sv)( cximag( jamp_sv[icol] ) ); +#endif + fptype2_sv ztempR_sv = cf2.value[icol][icol] * jampRi_sv; + fptype2_sv ztempI_sv = cf2.value[icol][icol] * jampIi_sv; + // Off-diagonal terms + for( int jcol = icol + 1; jcol < ncolor; jcol++ ) + { +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype2_sv& jampRj_sv = jampR_sv[jcol]; + fptype2_sv& jampIj_sv = jampI_sv[jcol]; +#else + fptype2_sv jampRj_sv = (fptype2_sv)( cxreal( jamp_sv[jcol] ) ); + fptype2_sv jampIj_sv = (fptype2_sv)( cximag( jamp_sv[jcol] ) ); +#endif + ztempR_sv += cf2.value[icol][jcol] * jampRj_sv; + ztempI_sv += cf2.value[icol][jcol] * jampIj_sv; + } + fptype2_sv deltaMEs2 = ( jampRi_sv * ztempR_sv + jampIi_sv * ztempI_sv ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + deltaMEs_previous += fpvsplit0( deltaMEs2 ); + deltaMEs += fpvsplit1( deltaMEs2 ); +#else + deltaMEs += deltaMEs2; +#endif + // === C++ END === +#else + // === CUDA START === + fptype2_sv ztempR_sv = { 0 }; + fptype2_sv ztempI_sv = { 0 }; + for( int jcol = 0; jcol < ncolor; jcol++ ) + { + fptype2_sv jampRj_sv = cxreal( jamp_sv[jcol] 
); + fptype2_sv jampIj_sv = cximag( jamp_sv[jcol] ); + ztempR_sv += cf[icol][jcol] * jampRj_sv; + ztempI_sv += cf[icol][jcol] * jampIj_sv; + } + deltaMEs += ( ztempR_sv * cxreal( jamp_sv[icol] ) + ztempI_sv * cximag( jamp_sv[icol] ) ) / denom[icol]; + // === CUDA END === +#endif + } + + // *** STORE THE RESULTS *** + + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv += deltaMEs; // fix #435 +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv& MEs_sv_previous = E_ACCESS::kernelAccess( MEs_previous ); + MEs_sv_previous += deltaMEs_previous; +#endif + /* +#ifdef __CUDACC__ + if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); +#else +#ifdef MGONGPU_CPPSIMD + if( cNGoodHel > 0 ) + for( int ieppV = 0; ieppV < neppV; ieppV++ ) + printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0 + ieppV, ihel, MEs_sv[ieppV] ); +#else + if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", ievt0, ihel, MEs_sv ); +#endif +#endif + */ + } // END LOOP ON IPARITY + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + CPPProcess::CPPProcess( bool verbose, + bool debug ) + : m_verbose( verbose ) + , m_debug( debug ) +#ifndef MGONGPU_HARDCODE_PARAM + , m_pars( 0 ) +#endif + , m_masses() + { + // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] + // *** NB There is no automatic check yet that these are in the same order as Fortran! 
#569 *** + static constexpr short tHel[ncomb][mgOnGpu::npar] = { + { -1, -1, -1, 1 }, + { -1, -1, -1, -1 }, + { -1, -1, 1, 1 }, + { -1, -1, 1, -1 }, + { -1, 1, -1, 1 }, + { -1, 1, -1, -1 }, + { -1, 1, 1, 1 }, + { -1, 1, 1, -1 }, + { 1, -1, -1, 1 }, + { 1, -1, -1, -1 }, + { 1, -1, 1, 1 }, + { 1, -1, 1, -1 }, + { 1, 1, -1, 1 }, + { 1, 1, -1, -1 }, + { 1, 1, 1, 1 }, + { 1, 1, 1, -1 } }; +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ) ); +#else + memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); +#endif + } + + //-------------------------------------------------------------------------- + + CPPProcess::~CPPProcess() {} + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HARDCODE_PARAM + // Initialize process (with parameters read from user cards) + void + CPPProcess::initProc( const std::string& param_card_name ) + { + // Instantiate the model class and set parameters that stay fixed during run + m_pars = Parameters_MSSM_SLHA2::getInstance(); + SLHAReader slha( param_card_name, m_verbose ); + m_pars->setIndependentParameters( slha ); + m_pars->setIndependentCouplings(); + //m_pars->setDependentParameters(); // now computed event-by-event (running alphas #373) + //m_pars->setDependentCouplings(); // now computed event-by-event (running alphas #373) + if( m_verbose ) + { + m_pars->printIndependentParameters(); + m_pars->printIndependentCouplings(); + //m_pars->printDependentParameters(); // now computed event-by-event (running alphas #373) + //m_pars->printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + // Set external particle masses for this matrix element + m_masses.push_back( m_pars->ZERO ); + m_masses.push_back( m_pars->ZERO ); + m_masses.push_back( m_pars->mdl_MT ); + m_masses.push_back( m_pars->mdl_MT ); + // Read physics parameters like masses and couplings from user configuration files (static: initialize once) + 
// Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory + const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; + //const cxtype tIPC[0] = { ... }; // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#else + memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); + //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#endif + //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + } +#else + // Initialize process (with hardcoded parameters) + void + CPPProcess::initProc( const std::string& /*param_card_name*/ ) + { + // Use hardcoded physics parameters + if( m_verbose ) + { + Parameters_MSSM_SLHA2::printIndependentParameters(); + Parameters_MSSM_SLHA2::printIndependentCouplings(); + //Parameters_MSSM_SLHA2::printDependentParameters(); // now computed event-by-event (running alphas #373) + //Parameters_MSSM_SLHA2::printDependentCouplings(); // now computed event-by-event (running alphas #373) + } + // Set external particle masses for this matrix element + m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); + m_masses.push_back( Parameters_MSSM_SLHA2::ZERO ); + m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); + m_masses.push_back( Parameters_MSSM_SLHA2::mdl_MT ); + } +#endif + + //-------------------------------------------------------------------------- + + // Retrieve the compiler that was used to build this module + const std::string + CPPProcess::getCompiler() + { + std::stringstream out; + // CUDA version (NVCC) + // [Use __NVCC__ instead of __CUDACC__ here!] 
+ // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] + // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] +#ifdef __NVCC__ +#if defined __CUDACC_VER_MAJOR__ && defined __CUDACC_VER_MINOR__ && defined __CUDACC_VER_BUILD__ + out << "nvcc " << __CUDACC_VER_MAJOR__ << "." << __CUDACC_VER_MINOR__ << "." << __CUDACC_VER_BUILD__; +#else + out << "nvcc UNKNOWN"; +#endif + out << " ("; +#endif + // ICX version (either as CXX or as host compiler inside NVCC) +#if defined __INTEL_COMPILER +#error "icc is no longer supported: please use icx" +#elif defined __INTEL_LLVM_COMPILER // alternative: __INTEL_CLANG_COMPILER + out << "icx " << __INTEL_LLVM_COMPILER; +#ifdef __NVCC__ + out << ", "; +#else + out << " ("; +#endif +#endif + // CLANG version (either as CXX or as host compiler inside NVCC or inside ICX) +#if defined __clang__ +#if defined __clang_major__ && defined __clang_minor__ && defined __clang_patchlevel__ +#ifdef __APPLE__ + out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; +#else + out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + // GCC toolchain version inside CLANG + std::string tchainout; + std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; + std::unique_ptr<FILE, decltype( &pclose )> tchainpipe( popen( tchaincmd.c_str(), "r" ), pclose ); + if( !tchainpipe ) throw std::runtime_error( "`readelf ...` failed?" 
); + std::array<char, 128> tchainbuf; + while( fgets( tchainbuf.data(), tchainbuf.size(), tchainpipe.get() ) != nullptr ) tchainout += tchainbuf.data(); + tchainout.pop_back(); // remove trailing newline +#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ", gcc " << tchainout; +#else + out << " (gcc " << tchainout << ")"; +#endif +#endif +#else + out << "clang UNKNOWKN"; +#endif +#else + // GCC version (either as CXX or as host compiler inside NVCC) +#if defined __GNUC__ && defined __GNUC_MINOR__ && defined __GNUC_PATCHLEVEL__ + out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__; +#else + out << "gcc UNKNOWKN"; +#endif +#endif +#if defined __NVCC__ or defined __INTEL_LLVM_COMPILER + out << ")"; +#endif + return out.str(); + } + + //-------------------------------------------------------------------------- + + __global__ void /* clang-format off */ + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings // output: couplings[nevt*ndcoup*2] +#ifndef __CUDACC__ + , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + ) /* clang-format on */ + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; + using G_ACCESS = DeviceAccessGs; + using C_ACCESS = DeviceAccessCouplings; + G2COUP<G_ACCESS, C_ACCESS>( allgs, allcouplings ); +#else + using namespace mg5amcCpu; + using G_ACCESS = HostAccessGs; + using C_ACCESS = HostAccessCouplings; + for( int ipagV = 0; ipagV < nevt / neppV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); + fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); + G2COUP<G_ACCESS, C_ACCESS>( gs, couplings ); + } +#endif + } + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: 
couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) + { /* clang-format on */ + // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) + fptype allMEsLast = 0; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + allMEs[ievt] = 0; + for( int ihel = 0; ihel < ncomb; ihel++ ) + { + // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) + constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + constexpr unsigned int channelId = 0; // disable single-diagram channel enhancement + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); +#else + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); +#endif + if( allMEs[ievt] != allMEsLast ) + { + //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; + isGoodHel[ihel] = true; + } + allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt + } + } +#else + void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel 
denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) + const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) + { + //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) + constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) + fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr + // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) + assert( nevt >= neppV ); + const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt allMEs[nevt*nprocesses]?) 
+ + __global__ void /* clang-format off */ + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + const fptype* allrndcol, // input: random numbers[nevt] for color selection + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + int* allselhel, // output: helicity selection[nevt] + int* allselcol // output: helicity selection[nevt] +#ifndef __CUDACC__ + , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + ) /* clang-format on */ + { + mgDebugInitialise(); + + // Denominators: spins, colors and identical particles + constexpr int nprocesses = 1; + static_assert( nprocesses == 1, "Assume nprocesses == 1" ); // FIXME (#343): assume nprocesses == 1 + constexpr int helcolDenominators[1] = { 256 }; + +#ifdef __CUDACC__ + // Remember: in CUDA this is a kernel for one event, in c++ this processes n events + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid +#else + //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] + using E_ACCESS = HostAccessMatrixElements; // non-trivial access: buffer includes all events +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + using NUM_ACCESS = HostAccessNumerators; // non-trivial access: buffer 
includes all events + using DEN_ACCESS = HostAccessDenominators; // non-trivial access: buffer includes all events +#endif +#endif + + // Start sigmaKin_lines + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === + // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event + // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) +#ifdef __CUDACC__ + allMEs[ievt] = 0; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + allNumerators[ievt] = 0; + allDenominators[ievt] = 0; +#endif +#else + const int npagV = nevt / neppV; + for( int ipagV = 0; ipagV < npagV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv = fptype_sv{ 0 }; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + numerators_sv = fptype_sv{ 0 }; + denominators_sv = fptype_sv{ 0 }; +#endif + } +#endif + + // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === + // (in both CUDA and C++, using precomputed good helicities) + // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here#ifdef __CUDACC__ + +#ifdef __CUDACC__ // CUDA OR C++ + + // *** START OF PART 1a - CUDA (one event per CPU thread) *** + // Running sum of partial amplitudes squared for event by event color selection (#402) + // (for the single event processed in calculate_wavefunctions) + fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; + fptype MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for this event) + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = 
cGoodHel[ighel]; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, channelId, allNumerators, allDenominators, jamp2_sv ); +#else + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); +#endif + MEs_ighel[ighel] = allMEs[ievt]; + } + // Event-by-event random choice of helicity #403 + //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + if( allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ) ) + { + const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + // Event-by-event random choice of color #402 + const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } + } +#endif + // *** END OF PART 1a - CUDA (one event per CPU thread) *** + +#else // CUDA OR C++ + + // *** START OF PART 1b - C++ (loop on event pages) +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + // Mixed fptypes #537: float for color algebra and double elsewhere + // Delay color algebra and ME updates (only on even pages) + assert( npagV % 2 == 0 ); // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page + const int 
npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time +#else + const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time +#endif +#ifdef _OPENMP + // OMP multithreading #575 (NB: tested only with gcc11 so far) + // See https://www.openmp.org/specifications/ + // - default(none): no variables are shared by default + // - shared: as the name says + // - private: give each thread its own copy, without initialising + // - firstprivate: give each thread its own copy, and initialise with value from outside +#define _OMPLIST0 allcouplings, allMEs, allmomenta, allrndcol, allrndhel, allselcol, allselhel, cGoodHel, cNGoodHel, npagV2 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL +#define _OMPLIST1 , allDenominators, allNumerators, channelId, mgOnGpu::icolamp +#else +#define _OMPLIST1 +#endif +#pragma omp parallel for default( none ) shared( _OMPLIST0 _OMPLIST1 ) +#undef _OMPLIST0 +#undef _OMPLIST1 +#endif // _OPENMP + for( int ipagV2 = 0; ipagV2 < npagV2; ++ipagV2 ) + { + // Running sum of partial amplitudes squared for event by event color selection (#402) + // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_wavefunctions) + fptype_sv jamp2_sv[nParity * ncolor] = { 0 }; + fptype_sv MEs_ighel[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page) +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv MEs_ighel2[ncomb] = { 0 }; // sum of MEs for all good helicities up to ighel (for the second neppV page) + const int ievt00 = ipagV2 * neppV * 2; // loop on two SIMD pages (neppV events) at a time +#else + const int ievt00 = ipagV2 * neppV; // loop on one SIMD page (neppV events) at a time +#endif + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + const int ihel = cGoodHel[ighel]; +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + calculate_wavefunctions( ihel, allmomenta, allcouplings, 
allMEs, channelId, allNumerators, allDenominators, jamp2_sv, ievt00 ); +#else + calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv, ievt00 ); +#endif + MEs_ighel[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 ) ); +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + MEs_ighel2[ighel] = E_ACCESS::kernelAccess( E_ACCESS::ieventAccessRecord( allMEs, ievt00 + neppV ) ); +#endif + } + // Event-by-event random choice of helicity #403 + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt, allrndhel[ievt] ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { +#if defined MGONGPU_CPPSIMD + const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel][ieppV] / MEs_ighel[cNGoodHel - 1][ieppV] ); +#else + const bool okhel = allrndhel[ievt] < ( MEs_ighel[ighel] / MEs_ighel[cNGoodHel - 1] ); +#endif + if( okhel ) + { + const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt] = ihelF; + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndhel=%f\n", ievt2, allrndhel[ievt2] ); + for( int ighel = 0; ighel < cNGoodHel; ighel++ ) + { + if( allrndhel[ievt2] < ( MEs_ighel2[ighel][ieppV] / MEs_ighel2[cNGoodHel - 1][ieppV] ) ) + { + const int ihelF = cGoodHel[ighel] + 1; // NB Fortran [1,ncomb], cudacpp [0,ncomb-1] + allselhel[ievt2] = ihelF; + //printf( "sigmaKin: ievt=%4d ihel=%4d\n", ievt, ihelF ); + break; + } + } +#endif + } +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) + const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + // Event-by-event random choice of color #402 + fptype_sv 
targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + fptype_sv targetamp2[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { +#if defined MGONGPU_CPPSIMD + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); +#else + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); +#endif + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } + } +#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } + } +#endif + } +#endif // multichannel enabled (random color choice) + } + // *** END OF PART 1b - C++ (loop on event pages) + +#endif // CUDA or C++ + + // === PART 2 - FINALISATION (after calculate_wavefunctions) === + // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities 
for the given event + // [NB 'sum over final spins, average over initial spins', eg see + // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] + // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) +#ifdef __CUDACC__ + allMEs[ievt] /= helcolDenominators[0]; // FIXME (#343): assume nprocesses == 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // FIXME (#343): assume nprocesses == 1 +#endif +#else + for( int ipagV = 0; ipagV < npagV; ++ipagV ) + { + const int ievt0 = ipagV * neppV; + fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); + fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); + MEs_sv /= helcolDenominators[0]; // FIXME (#343): assume nprocesses == 1 +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + if( channelId > 0 ) + { + fptype* numerators = NUM_ACCESS::ieventAccessRecord( allNumerators, ievt0 ); + fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); + fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); + fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); + MEs_sv *= numerators_sv / denominators_sv; // FIXME (#343): assume nprocesses == 1 + } +#endif + //for( int ieppV = 0; ieppV < neppV; ieppV++ ) + //{ + // const unsigned int ievt = ipagV * neppV + ieppV; + // printf( "sigmaKin: ievt=%2d me=%f\n", ievt, allMEs[ievt] ); + //} + } +#endif + mgDebugFinalise(); + } + + //-------------------------------------------------------------------------- + +} // end namespace + +//========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h new file mode 100644 index 0000000000..76e0e2bdf3 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -0,0 +1,166 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef MG5_Sigma_MSSM_SLHA2_gg_ttx_H +#define MG5_Sigma_MSSM_SLHA2_gg_ttx_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "Parameters_MSSM_SLHA2.h" + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //========================================================================== + // A class for calculating the matrix elements for + // Process: g g > t t~ WEIGHTED<=2 @1 + //-------------------------------------------------------------------------- + + class CPPProcess + { + public: /* clang-format off */ + + // Constructor (from command line arguments) + CPPProcess( bool verbose = false, bool debug = false ); + + // Destructor + ~CPPProcess(); + + // Initialize process (read model parameters from file) + virtual void initProc( const std::string& param_card_name ); + + // Retrieve the compiler that was used to build this module + static const std::string getCompiler(); + + // Other methods of this instance (???) 
+ //const std::vector& getMasses() const { return m_masses; } + //virtual int code() const{ return 1; } + //void setInitial( int inid1, int inid2 ){ id1 = inid1; id2 = inid2; } + //int getDim() const { return dim; } + //int getNIOParticles() const { return nexternal; } // nexternal was nioparticles + + // Accessors (unused so far: add four of them only to fix a clang build warning) + //bool verbose() const { return m_verbose; } + bool debug() const { return m_debug; } + + public: /* clang-format on */ + + // Hardcoded parameters for this process (constant class variables) + //static const int ninitial = mgOnGpu::npari; + //static const int nexternal = 4; // mgOnGpu::npar (nexternal was nioparticles) + //static const int nprocesses = 1; // FIXME: assume process.nprocesses == 1 + //static const int nwavefuncs = 6; // mgOnGpu::nwf + //static const int namplitudes = 3; + //static const int ncomb = 16; // mgOnGpu::ncomb + + private: + + // Command line arguments (constructor) + bool m_verbose; + bool m_debug; + + // Physics model parameters to be read from file (initProc function) +#ifndef MGONGPU_HARDCODE_PARAM + Parameters_MSSM_SLHA2* m_pars; +#endif + std::vector m_masses; // external particle masses + + // Other variables of this instance (???) + //int id1, id2; // initial particle ids + //cxtype** amp; // ??? 
+ }; + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] +#else + __global__ void + computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] + fptype* allcouplings, // output: couplings[nevt*ndcoup*2] + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel ); // output: isGoodHel[ncomb] - device array (CUDA implementation) +#else + __global__ void + sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + bool* isGoodHel, // output: isGoodHel[ncomb] - host array (C++ implementation) + const int nevt ); // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- + + int // output: nGoodHel (the number 
of good helicity combinations out of ncomb) + sigmaKin_setGoodHel( const bool* isGoodHel ); // input: isGoodHel[ncomb] - host array + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ /* clang-format off */ + __global__ void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + const fptype* allrndcol, // input: random numbers[nevt] for color selection + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + int* allselhel, // output: helicity selection[nevt] + int* allselcol // output: helicity selection[nevt] + ); +#else + __global__ void + sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] + const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] + const fptype* allrndhel, // input: random numbers[nevt] for helicity selection + const fptype* allrndcol, // input: random numbers[nevt] for color selection + fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities +#ifdef MGONGPU_SUPPORTS_MULTICHANNEL + const unsigned int channelId, // input: multichannel channel id (1 to #diagrams); 0 to disable channel enhancement + fptype* allNumerators, // output: multichannel numerators[nevt], running_sum_over_helicities + fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities +#endif + int* allselhel, // output: helicity selection[nevt] + int* allselcol, // output: helicity selection[nevt] + const int nevt ); // input: 
#events (for cuda: nevt == ndim == gpublocks*gputhreads) +#endif /* clang-format on */ + + //-------------------------------------------------------------------------- +} + +#endif // MG5_Sigma_MSSM_SLHA2_gg_ttx_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.cc new file mode 120000 index 0000000000..d9cb57c4bb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.cc @@ -0,0 +1 @@ +../CrossSectionKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.h new file mode 120000 index 0000000000..125b8758e4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CrossSectionKernels.h @@ -0,0 +1 @@ +../CrossSectionKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/EventStatistics.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/EventStatistics.h new file mode 120000 index 0000000000..34c1a31129 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/EventStatistics.h @@ -0,0 +1 @@ +../EventStatistics.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MadgraphTest.h 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MadgraphTest.h new file mode 120000 index 0000000000..13942d64c4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MadgraphTest.h @@ -0,0 +1 @@ +../MadgraphTest.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.cc new file mode 120000 index 0000000000..f800cb9638 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.cc @@ -0,0 +1 @@ +../MatrixElementKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.h new file mode 120000 index 0000000000..ac47855d4f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MatrixElementKernels.h @@ -0,0 +1 @@ +../MatrixElementKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessAmplitudes.h new file mode 120000 index 0000000000..448995d3e5 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessAmplitudes.h @@ -0,0 +1 @@ +../MemoryAccessAmplitudes.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplings.h new file mode 120000 index 0000000000..388f907580 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplings.h @@ -0,0 +1 @@ 
+../MemoryAccessCouplings.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplingsFixed.h new file mode 120000 index 0000000000..c795c16465 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessCouplingsFixed.h @@ -0,0 +1 @@ +../MemoryAccessCouplingsFixed.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessDenominators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessDenominators.h new file mode 120000 index 0000000000..4ab752bdad --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessDenominators.h @@ -0,0 +1 @@ +../MemoryAccessDenominators.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessGs.h new file mode 120000 index 0000000000..9d5e237faf --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessGs.h @@ -0,0 +1 @@ +../MemoryAccessGs.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessHelpers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessHelpers.h new file mode 120000 index 0000000000..3692f9e4da --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessHelpers.h @@ -0,0 +1 @@ +../MemoryAccessHelpers.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMatrixElements.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMatrixElements.h 
new file mode 120000 index 0000000000..b04a26e4f6 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMatrixElements.h @@ -0,0 +1 @@ +../MemoryAccessMatrixElements.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMomenta.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMomenta.h new file mode 120000 index 0000000000..4a5e8b375d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessMomenta.h @@ -0,0 +1 @@ +../MemoryAccessMomenta.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessNumerators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessNumerators.h new file mode 120000 index 0000000000..a525b6607d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessNumerators.h @@ -0,0 +1 @@ +../MemoryAccessNumerators.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessRandomNumbers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessRandomNumbers.h new file mode 120000 index 0000000000..844de324e7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessRandomNumbers.h @@ -0,0 +1 @@ +../MemoryAccessRandomNumbers.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessVectors.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessVectors.h new file mode 120000 index 0000000000..d890503974 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessVectors.h @@ -0,0 +1 @@ +../MemoryAccessVectors.h \ No newline at end of file diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWavefunctions.h new file mode 120000 index 0000000000..61a331899b --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWavefunctions.h @@ -0,0 +1 @@ +../MemoryAccessWavefunctions.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWeights.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWeights.h new file mode 120000 index 0000000000..ec10cd2e17 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryAccessWeights.h @@ -0,0 +1 @@ +../MemoryAccessWeights.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryBuffers.h new file mode 120000 index 0000000000..600b7ad779 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/MemoryBuffers.h @@ -0,0 +1 @@ +../MemoryBuffers.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.cc new file mode 120000 index 0000000000..033b20955e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.cc @@ -0,0 +1 @@ +../RamboSamplingKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.h new file mode 120000 index 0000000000..ca354ce496 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RamboSamplingKernels.h @@ -0,0 +1 @@ +../RamboSamplingKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc new file mode 120000 index 0000000000..09a0e03a16 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc @@ -0,0 +1 @@ +../RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.h new file mode 120000 index 0000000000..5e8526a6ae --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.h @@ -0,0 +1 @@ +../RandomNumberKernels.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc new file mode 100644 index 0000000000..f91ee8ebfb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc @@ -0,0 +1,1120 @@ +#include "mgOnGpuConfig.h" + +#include "BridgeKernels.h" +#include "CPPProcess.h" +#include "CrossSectionKernels.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#include "epoch_process_id.h" +#include "ompnumthreads.h" +#include "timermap.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STRINGIFY( s ) #s +#define 
XSTRINGIFY( s ) STRINGIFY( s ) + +#define SEP79 79 + +bool +is_number( const char* s ) +{ + const char* t = s; + while( *t != '\0' && isdigit( *t ) ) + ++t; + return (int)strlen( s ) == t - s; +} + +int +usage( char* argv0, int ret = 1 ) +{ + std::cout << "Usage: " << argv0 + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; + std::cout << std::endl; + std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; + std::cout << "(also in CPU/C++ code, where only the product of these two parameters counts)" << std::endl; + std::cout << std::endl; + std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; + std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; +#ifndef __CUDACC__ +#ifdef _OPENMP + std::cout << std::endl; + std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; + std::cout << "(OMP multithreading will be disabled if OMP_NUM_THREADS is not set)" << std::endl; +#endif +#endif + return ret; +} + +int +main( int argc, char** argv ) +{ + // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
+#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + + // DEFAULTS FOR COMMAND LINE ARGUMENTS + bool verbose = false; + bool debug = false; + bool perf = false; + bool json = false; + unsigned int niter = 0; + unsigned int gpublocks = 1; + unsigned int gputhreads = 32; + unsigned int jsondate = 0; + unsigned int jsonrun = 0; + unsigned int numvec[5] = { 0, 0, 0, 0, 0 }; + int nnum = 0; + // Random number mode + enum class RandomNumberMode + { + CommonRandom = 0, + CurandHost = 1, + CurandDevice = 2 + }; +#ifdef __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU +#elif not defined MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand +#endif + // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) + enum class RamboSamplingMode + { + RamboHost = 1, + RamboDevice = 2 + }; +#ifdef __CUDACC__ + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU +#else + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU +#endif + // Bridge emulation mode (NB Bridge implies RamboHost!) 
+ bool bridge = false; + + // READ COMMAND LINE ARGUMENTS + for( int argn = 1; argn < argc; ++argn ) + { + std::string arg = argv[argn]; + if( ( arg == "--verbose" ) || ( arg == "-v" ) ) + { + verbose = true; + } + else if( ( arg == "--debug" ) || ( arg == "-d" ) ) + { + debug = true; + } + else if( ( arg == "--performance" ) || ( arg == "-p" ) ) + { + perf = true; + } + else if( ( arg == "--json" ) || ( arg == "-j" ) ) + { + json = true; + } + else if( arg == "--curdev" ) + { +#ifdef __CUDACC__ + rndgen = RandomNumberMode::CurandDevice; +#else + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); +#endif + } + else if( arg == "--curhst" ) + { +#ifndef MGONGPU_HAS_NO_CURAND + rndgen = RandomNumberMode::CurandHost; +#else + throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#endif + } + else if( arg == "--common" ) + { + rndgen = RandomNumberMode::CommonRandom; + } + else if( arg == "--rmbdev" ) + { +#ifdef __CUDACC__ + rmbsmp = RamboSamplingMode::RamboDevice; +#else + throw std::runtime_error( "RamboDevice is not supported on CPUs" ); +#endif + } + else if( arg == "--rmbhst" ) + { + rmbsmp = RamboSamplingMode::RamboHost; + } + else if( arg == "--bridge" ) + { + bridge = true; + } + else if( is_number( argv[argn] ) && nnum < 5 ) + { + numvec[nnum++] = strtoul( argv[argn], NULL, 0 ); + } + else + { + return usage( argv[0] ); + } + } + + if( nnum == 3 || nnum == 5 ) + { + gpublocks = numvec[0]; + gputhreads = numvec[1]; + niter = numvec[2]; + if( nnum == 5 ) + { + jsondate = numvec[3]; + jsonrun = numvec[4]; + } + } + else if( nnum == 1 ) + { + niter = numvec[0]; + } + else + { + return usage( argv[0] ); + } + + if( niter == 0 ) + return usage( argv[0] ); + + if( bridge && rmbsmp == RamboSamplingMode::RamboDevice ) + { + std::cout << "WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost" << std::endl; + rmbsmp = RamboSamplingMode::RamboHost; + } + + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::CurandDevice ) + { +#if not defined MGONGPU_HAS_NO_CURAND + std::cout << "WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost" << std::endl; + rndgen = RandomNumberMode::CurandHost; +#else + std::cout << "WARNING! RamboHost selected: cannot use CurandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + + using mgOnGpu::ntpbMAX; + if( gputhreads > ntpbMAX ) + { + std::cout << "ERROR! #threads/block should be <= " << ntpbMAX << std::endl; + return usage( argv[0] ); + } + +#ifndef __CUDACC__ +#ifdef _OPENMP + ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) +#endif +#endif + +#ifndef __CUDACC__ + // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation + // Note: this prevents a crash on pmpe04 but not on some github CI nodes? + // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] + if( !MatrixElementKernelHost::hostSupportsSIMD() ) return 1; +#endif + + const unsigned int ndim = gpublocks * gputhreads; // number of threads in one GPU grid + const unsigned int nevt = ndim; // number of events in one iteration == number of GPU threads + + if( verbose ) + std::cout << "# iterations: " << niter << std::endl; + + // *** START THE NEW TIMERS *** + mgOnGpu::TimerMap timermap; + + // === STEP 0 - INITIALISE + +#ifdef __CUDACC__ + + // --- 00. 
Initialise cuda + // Instantiate a CudaRuntime at the beginnining of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; + timermap.start( cdinKey ); + CudaRuntime cudaRuntime( debug ); +#endif + + // --- 0a. Initialise physics process + const std::string procKey = "0a ProcInit"; + timermap.start( procKey ); + + // Create a process object + CPPProcess process( verbose ); + + // Read param_card and set parameters + process.initProc( "../../Cards/param_card.dat" ); + const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) + //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) + const int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + + // --- 0b. Allocate memory structures + const std::string alloKey = "0b MemAlloc"; + timermap.start( alloKey ); + + // Memory buffers for random numbers for momenta +#ifndef __CUDACC__ + HostBufferRndNumMomenta hstRndmom( nevt ); +#else + PinnedHostBufferRndNumMomenta hstRndmom( nevt ); + DeviceBufferRndNumMomenta devRndmom( nevt ); +#endif + + // Memory buffers for sampling weights +#ifndef __CUDACC__ + HostBufferWeights hstWeights( nevt ); +#else + PinnedHostBufferWeights hstWeights( nevt ); + DeviceBufferWeights devWeights( nevt ); +#endif + + // Memory buffers for momenta +#ifndef __CUDACC__ + HostBufferMomenta hstMomenta( nevt ); +#else + PinnedHostBufferMomenta hstMomenta( nevt ); + DeviceBufferMomenta devMomenta( nevt ); +#endif + + // Memory buffers for Gs +#ifndef __CUDACC__ + HostBufferGs hstGs( nevt ); +#else + PinnedHostBufferGs hstGs( nevt ); + DeviceBufferGs devGs( nevt ); +#endif + + // Hardcode Gs for now (eventually they should come from Fortran MadEvent) + for( unsigned int i = 0; i < nevt; ++i ) + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for 
aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + hstGs[i] = fixedG; + //if ( i > 0 ) hstGs[i] = 0; // try hardcoding G only for event 0 + //hstGs[i] = i; + } + + // Memory buffers for matrix elements +#ifndef __CUDACC__ + HostBufferMatrixElements hstMatrixElements( nevt ); +#else + PinnedHostBufferMatrixElements hstMatrixElements( nevt ); + DeviceBufferMatrixElements devMatrixElements( nevt ); +#endif + + // Memory buffers for random numbers for helicity selection + // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** +#ifndef __CUDACC__ + HostBufferRndNumHelicity hstRndHel( nevt ); +#else + PinnedHostBufferRndNumHelicity hstRndHel( nevt ); + DeviceBufferRndNumHelicity devRndHel( nevt ); +#endif + + // Memory buffers for random numbers for color selection + // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** +#ifndef __CUDACC__ + HostBufferRndNumColor hstRndCol( nevt ); +#else + PinnedHostBufferRndNumColor hstRndCol( nevt ); + DeviceBufferRndNumColor devRndCol( nevt ); +#endif + + // Memory buffers for helicity selection +#ifndef __CUDACC__ + HostBufferSelectedHelicity hstSelHel( nevt ); +#else + PinnedHostBufferSelectedHelicity hstSelHel( nevt ); + DeviceBufferSelectedHelicity devSelHel( nevt ); +#endif + + // Memory buffers for color selection +#ifndef __CUDACC__ + HostBufferSelectedColor hstSelCol( nevt ); +#else + PinnedHostBufferSelectedColor hstSelCol( nevt ); + DeviceBufferSelectedColor devSelCol( nevt ); +#endif + + std::unique_ptr genrtimes( new double[niter] ); + std::unique_ptr rambtimes( new double[niter] ); + std::unique_ptr wavetimes( new double[niter] ); + std::unique_ptr wv3atimes( new double[niter] ); + + // --- 0c. 
Create curand or common generator + const std::string cgenKey = "0c GenCreat"; + timermap.start( cgenKey ); + // Allocate the appropriate RandomNumberKernel + std::unique_ptr prnk; + if( rndgen == RandomNumberMode::CommonRandom ) + { + prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); + } +#ifndef MGONGPU_HAS_NO_CURAND + else if( rndgen == RandomNumberMode::CurandHost ) + { + const bool onDevice = false; + prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); + } +#ifdef __CUDACC__ + else + { + const bool onDevice = true; + prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); + } +#else + else + { + throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + } +#endif +#else + else + { + throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) + } +#endif + + // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] + std::unique_ptr prsk; + if( rmbsmp == RamboSamplingMode::RamboHost ) + { + prsk.reset( new RamboSamplingKernelHost( energy, hstRndmom, hstMomenta, hstWeights, nevt ) ); + } + else + { +#ifdef __CUDACC__ + prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); +#else + throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) +#endif + } + + // --- 0c. 
Create matrix element kernel [keep this in 0c for the moment] + std::unique_ptr pmek; + if( !bridge ) + { +#ifdef __CUDACC__ + pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); +#else + pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + else + { +#ifdef __CUDACC__ + pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); +#else + pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); +#endif + } + int nGoodHel = 0; // the number of good helicities (out of ncomb) + + // --- 0c. Create cross section kernel [keep this in 0c for the moment] + EventStatistics hstStats; + CrossSectionKernelHost xsk( hstWeights, hstMatrixElements, hstStats, nevt ); + + // ************************************** + // *** START MAIN LOOP ON #ITERATIONS *** + // ************************************** + + for( unsigned long int iiter = 0; iiter < niter; ++iiter ) + { + //std::cout << "Iteration #" << iiter+1 << " of " << niter << std::endl; + + // === STEP 1 OF 3 + + // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** + double genrtime = 0; + + // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // [NB This should not be necessary using the host API: "Generation functions + // can be called multiple times on the same generator to generate successive + // blocks of results. For pseudorandom generators, multiple calls to generation + // functions will yield the same result as a single call with a large size."] + const unsigned long long seed = 20200805; + const std::string sgenKey = "1a GenSeed "; + timermap.start( sgenKey ); + prnk->seedGenerator( seed + iiter ); + genrtime += timermap.stop(); + + // --- 1b. 
Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host + const std::string rngnKey = "1b GenRnGen"; + timermap.start( rngnKey ); + prnk->generateRnarray(); + //std::cout << "Got random numbers" << std::endl; + +#ifdef __CUDACC__ + if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 1c. Copy rndmom from host to device + const std::string htodKey = "1c CpHTDrnd"; + genrtime += timermap.start( htodKey ); + copyDeviceFromHost( devRndmom, hstRndmom ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RANDOM GEN *** + genrtime += timermap.stop(); + + // === STEP 2 OF 3 + // Fill in particle momenta for each of nevt events on the device + + // *** START THE OLD-STYLE TIMER FOR RAMBO *** + double rambtime = 0; + + // --- 2a. Fill in momenta of initial state particles on the device + const std::string riniKey = "2a RamboIni"; + timermap.start( riniKey ); + prsk->getMomentaInitial(); + //std::cout << "Got initial momenta" << std::endl; + + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + const std::string rfinKey = "2b RamboFin"; + rambtime += timermap.start( rfinKey ); + prsk->getMomentaFinal(); + //std::cout << "Got final momenta" << std::endl; + +#ifdef __CUDACC__ + if( rmbsmp == RamboSamplingMode::RamboDevice ) + { + // --- 2c. CopyDToH Weights + const std::string cwgtKey = "2c CpDTHwgt"; + rambtime += timermap.start( cwgtKey ); + copyHostFromDevice( hstWeights, devWeights ); + + // --- 2d. CopyDToH Momenta + const std::string cmomKey = "2d CpDTHmom"; + rambtime += timermap.start( cmomKey ); + copyHostFromDevice( hstMomenta, devMomenta ); + } + else // only if ( ! bridge ) ??? + { + // --- 2c. 
CopyHToD Weights + const std::string cwgtKey = "2c CpHTDwgt"; + rambtime += timermap.start( cwgtKey ); + copyDeviceFromHost( devWeights, hstWeights ); + + // --- 2d. CopyHToD Momenta + const std::string cmomKey = "2d CpHTDmom"; + rambtime += timermap.start( cmomKey ); + copyDeviceFromHost( devMomenta, hstMomenta ); + } +#endif + + // *** STOP THE OLD-STYLE TIMER FOR RAMBO *** + rambtime += timermap.stop(); + + // === STEP 3 OF 3 + // Evaluate matrix elements for all nevt events + // 0d. For Bridge only, transpose C2F [renamed as 0d: this is not initialisation, but I want it out of the ME timers (#371)] + // 0e. (Only on the first iteration) Get good helicities [renamed as 0e: this IS initialisation!] + // 3a. Evaluate MEs on the device (include transpose F2C for Bridge) + // 3b. Copy MEs back from device to host + + // --- 0d. TransC2F + if( bridge ) + { + const std::string tc2fKey = "0d TransC2F"; + timermap.start( tc2fKey ); + dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); + } + +#ifdef __CUDACC__ + // --- 2d. CopyHToD Momenta + const std::string gKey = "0.. CpHTDg"; + rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! + copyDeviceFromHost( devGs, hstGs ); +#endif + + // --- 0e. SGoodHel + if( iiter == 0 ) + { + const std::string ghelKey = "0e SGoodHel"; + timermap.start( ghelKey ); + nGoodHel = pmek->computeGoodHelicities(); + } + + // *** START THE OLD-STYLE TIMERS FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + double wavetime = 0; // calc plus copy + double wv3atime = 0; // calc only + + // --- 3a. SigmaKin + const std::string skinKey = "3a SigmaKin"; + timermap.start( skinKey ); + constexpr unsigned int channelId = 0; // TEMPORARY? 
disable multi-channel in check.exe and gcheck.exe #466 + pmek->computeMatrixElements( channelId ); + + // *** STOP THE NEW OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wv3atime += timermap.stop(); // calc only + wavetime += wv3atime; // calc plus copy + +#ifdef __CUDACC__ + if( !bridge ) + { + // --- 3b. CopyDToH MEs + const std::string cmesKey = "3b CpDTHmes"; + timermap.start( cmesKey ); + copyHostFromDevice( hstMatrixElements, devMatrixElements ); + // *** STOP THE OLD OLD-STYLE TIMER FOR MATRIX ELEMENTS (WAVEFUNCTIONS) *** + wavetime += timermap.stop(); // calc plus copy + } +#endif + + // === STEP 4 FINALISE LOOP + // --- 4@ Update event statistics + const std::string updtKey = "4@ UpdtStat"; + timermap.start( updtKey ); + xsk.updateEventStatistics(); + + // --- 4a Dump within the loop + const std::string loopKey = "4a DumpLoop"; + timermap.start( loopKey ); + genrtimes[iiter] = genrtime; + rambtimes[iiter] = rambtime; + wavetimes[iiter] = wavetime; + wv3atimes[iiter] = wv3atime; + + if( verbose ) + { + std::cout << std::string( SEP79, '*' ) << std::endl + << "Iteration #" << iiter + 1 << " of " << niter << std::endl; + if( perf ) std::cout << "Wave function time: " << wavetime << std::endl; + } + + for( unsigned int ievt = 0; ievt < nevt; ++ievt ) // Loop over all events in this iteration + { + if( verbose ) + { + // Display momenta + std::cout << "Momenta:" << std::endl; + for( int ipar = 0; ipar < mgOnGpu::npar; ipar++ ) + { + // NB: 'setw' affects only the next field (of any type) + std::cout << std::scientific // fixed format: affects all floats (default precision: 6) + << std::setw( 4 ) << ipar + 1 + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 0, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 1, ipar ) + << std::setw( 14 ) << MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 2, ipar ) + << std::setw( 14 ) << 
MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, 3, ipar ) + << std::endl + << std::defaultfloat; // default format: affects all floats + } + std::cout << std::string( SEP79, '-' ) << std::endl; + // Display matrix elements + std::cout << " Matrix element = " << MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ) + << " GeV^" << meGeVexponent << std::endl; // FIXME: assume process.nprocesses == 1 + std::cout << std::string( SEP79, '-' ) << std::endl; + } + } + + if( !( verbose || debug || perf ) ) + { + std::cout << "."; + } + } + + // ************************************** + // *** END MAIN LOOP ON #ITERATIONS *** + // ************************************** + + // === STEP 8 ANALYSIS + // --- 8a Analysis: compute stats after the loop + const std::string statKey = "8a CompStat"; + timermap.start( statKey ); + + double sumgtim = 0; + //double sqsgtim = 0; + double mingtim = genrtimes[0]; + double maxgtim = genrtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumgtim += genrtimes[iiter]; + //sqsgtim += genrtimes[iiter]*genrtimes[iiter]; + mingtim = std::min( mingtim, genrtimes[iiter] ); + maxgtim = std::max( maxgtim, genrtimes[iiter] ); + } + + double sumrtim = 0; + //double sqsrtim = 0; + double minrtim = rambtimes[0]; + double maxrtim = rambtimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumrtim += rambtimes[iiter]; + //sqsrtim += rambtimes[iiter]*rambtimes[iiter]; + minrtim = std::min( minrtim, rambtimes[iiter] ); + maxrtim = std::max( maxrtim, rambtimes[iiter] ); + } + + double sumwtim = 0; + //double sqswtim = 0; + double minwtim = wavetimes[0]; + double maxwtim = wavetimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumwtim += wavetimes[iiter]; + //sqswtim += wavetimes[iiter]*wavetimes[iiter]; + minwtim = std::min( minwtim, wavetimes[iiter] ); + maxwtim = std::max( maxwtim, wavetimes[iiter] ); + } + double meanwtim = sumwtim / niter; + 
//double stdwtim = std::sqrt( sqswtim / niter - meanwtim * meanwtim ); + + double sumw3atim = 0; + //double sqsw3atim = 0; + double minw3atim = wv3atimes[0]; + double maxw3atim = wv3atimes[0]; + for( unsigned int iiter = 0; iiter < niter; ++iiter ) + { + sumw3atim += wv3atimes[iiter]; + //sqsw3atim += wv3atimes[iiter]*wv3atimes[iiter]; + minw3atim = std::min( minw3atim, wv3atimes[iiter] ); + maxw3atim = std::max( maxw3atim, wv3atimes[iiter] ); + } + double meanw3atim = sumw3atim / niter; + //double stdw3atim = std::sqrt( sqsw3atim / niter - meanw3atim * meanw3atim ); + + const unsigned int nevtALL = hstStats.nevtALL; // total number of ALL events in all iterations + if( nevtALL != niter * nevt ) + std::cout << "ERROR! nevtALL mismatch " << nevtALL << " != " << niter * nevt << std::endl; // SANITY CHECK + int nabn = hstStats.nevtABN; + int nzero = hstStats.nevtZERO; + + // === STEP 9 FINALISE + + std::string rndgentxt; + if( rndgen == RandomNumberMode::CommonRandom ) + rndgentxt = "COMMON RANDOM HOST"; + else if( rndgen == RandomNumberMode::CurandHost ) + rndgentxt = "CURAND HOST"; + else if( rndgen == RandomNumberMode::CurandDevice ) + rndgentxt = "CURAND DEVICE"; +#ifdef __CUDACC__ + rndgentxt += " (CUDA code)"; +#else + rndgentxt += " (C++ code)"; +#endif + + // Workflow description summary + std::string wrkflwtxt; + // -- CUDA or C++? +#ifdef __CUDACC__ + wrkflwtxt += "CUD:"; +#else + wrkflwtxt += "CPP:"; +#endif + // -- DOUBLE or FLOAT? +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) +#elif defined MGONGPU_FPTYPE_DOUBLE + wrkflwtxt += "DBL+"; +#elif defined MGONGPU_FPTYPE_FLOAT + wrkflwtxt += "FLT+"; +#else + wrkflwtxt += "???+"; // no path to this statement +#endif + // -- CUCOMPLEX or THRUST or STD complex numbers? 
+#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + wrkflwtxt += "CUX:"; +#elif defined MGONGPU_CUCXTYPE_THRUST + wrkflwtxt += "THX:"; +#elif defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#else +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + wrkflwtxt += "STX:"; +#elif defined MGONGPU_CPPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif +#endif + // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? + if( rndgen == RandomNumberMode::CommonRandom ) + wrkflwtxt += "COMMON+"; + else if( rndgen == RandomNumberMode::CurandHost ) + wrkflwtxt += "CURHST+"; + else if( rndgen == RandomNumberMode::CurandDevice ) + wrkflwtxt += "CURDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement + // -- HOST or DEVICE rambo sampling? + if( rmbsmp == RamboSamplingMode::RamboHost ) + wrkflwtxt += "RMBHST+"; + else if( rmbsmp == RamboSamplingMode::RamboDevice ) + wrkflwtxt += "RMBDEV+"; + else + wrkflwtxt += "??????+"; // no path to this statement +#ifdef __CUDACC__ + // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? + if( !bridge ) + wrkflwtxt += "MESDEV"; + else + wrkflwtxt += "BRDDEV"; +#else + if( !bridge ) + wrkflwtxt += "MESHST"; // FIXME! allow this also in CUDA (eventually with various simd levels) + else + wrkflwtxt += "BRDHST"; +#endif + // -- SIMD matrix elements? +#if !defined MGONGPU_CPPSIMD + wrkflwtxt += "/none"; +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + wrkflwtxt += "/512z"; +#else + wrkflwtxt += "/512y"; +#endif +#elif defined __AVX2__ + wrkflwtxt += "/avx2"; +#elif defined __SSE4_2__ +#ifdef __PPC__ + wrkflwtxt += "/ppcv"; +#elif defined __ARM_NEON__ + wrkflwtxt += "/neon"; +#else + wrkflwtxt += "/sse4"; +#endif +#else + wrkflwtxt += "/????"; // no path to this statement +#endif + // -- Has cxtype_v::operator[] bracket with non-const reference? 
+#if defined MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + wrkflwtxt += "+CXVBRK"; +#else + wrkflwtxt += "+NOVBRK"; +#endif +#else + wrkflwtxt += "+NAVBRK"; // N/A +#endif + + // --- 9a Dump to screen + const std::string dumpKey = "9a DumpScrn"; + timermap.start( dumpKey ); + + if( !( verbose || debug || perf ) ) + { + std::cout << std::endl; + } + + if( perf ) + { +#ifndef __CUDACC__ +#ifdef _OPENMP + // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) + std::string nprocall; + std::unique_ptr nprocpipe( popen( "nproc --all", "r" ), pclose ); + if( !nprocpipe ) throw std::runtime_error( "`nproc --all` failed?" ); + std::array nprocbuf; + while( fgets( nprocbuf.data(), nprocbuf.size(), nprocpipe.get() ) != nullptr ) nprocall += nprocbuf.data(); +#endif +#endif +#ifdef MGONGPU_CPPSIMD +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + const std::string cxtref = " [cxtype_ref=YES]"; +#else + const std::string cxtref = " [cxtype_ref=NO]"; +#endif +#endif + // Dump all configuration parameters and all results + std::cout << std::string( SEP79, '*' ) << std::endl +#ifdef __CUDACC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#else + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" +#endif + << " [" << process.getCompiler() << "]" +#ifdef MGONGPU_INLINE_HELAMPS + << " [inlineHel=1]" +#else + << " [inlineHel=0]" +#endif +#ifdef MGONGPU_HARDCODE_PARAM + << " [hardcodePARAM=1]" << std::endl +#else + << " [hardcodePARAM=0]" << std::endl +#endif + << "NumBlocksPerGrid = " << gpublocks << std::endl + << "NumThreadsPerBlock = " << gputhreads << std::endl + << "NumIterations = " << niter << std::endl + << std::string( SEP79, '-' ) << std::endl; + std::cout << "Workflow summary = " << wrkflwtxt << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "FP precision = MIXED (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "FP precision = DOUBLE 
(NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl +#endif +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "Complex type = CUCOMPLEX" << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "Complex type = THRUST::COMPLEX" << std::endl +#endif +#else + << "Complex type = STD::COMPLEX" << std::endl +#endif + << "RanNumb memory layout = AOSOA[" << neppR << "]" + << ( neppR == 1 ? " == AOS" : "" ) + << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl + << "Momenta memory layout = AOSOA[" << neppM << "]" + << ( neppM == 1 ? " == AOS" : "" ) << std::endl +#ifdef __CUDACC__ + //<< "Wavefunction GPU memory = LOCAL" << std::endl +#else +#if !defined MGONGPU_CPPSIMD + << "Internal loops fptype_sv = SCALAR ('none': ~vector[" << neppV + << "], no SIMD)" << std::endl +#elif defined __AVX512VL__ +#ifdef MGONGPU_PVW512 + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512z': AVX512, 512bit)" << cxtref << std::endl +#else + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('512y': AVX512, 256bit)" << cxtref << std::endl +#endif +#elif defined __AVX2__ + << "Internal loops fptype_sv = VECTOR[" << neppV + << "] ('avx2': AVX2, 256bit)" << cxtref << std::endl +#elif defined __SSE4_2__ + << "Internal loops fptype_sv = VECTOR[" << neppV +#ifdef __PPC__ + << "] ('sse4': PPC VSX, 128bit)" << cxtref << std::endl +#elif defined __ARM_NEON__ + << "] ('sse4': ARM NEON, 128bit)" << cxtref << std::endl +#else + << "] ('sse4': SSE4.2, 128bit)" << cxtref << std::endl +#endif +#else +#error Internal error: unknown SIMD build configuration +#endif +#endif + << "Random number generation = " << rndgentxt << std::endl +#ifndef __CUDACC__ +#ifdef _OPENMP + << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline +#endif +#endif + //<< "MatrixElements compiler = " << 
process.getCompiler() << std::endl + << std::string( SEP79, '-' ) << std::endl + << "HelicityComb Good/Tot = " << nGoodHel << "/" << mgOnGpu::ncomb << std::endl + << std::string( SEP79, '-' ) << std::endl + << "NumberOfEntries = " << niter << std::endl + << std::scientific // fixed format: affects all floats (default precision: 6) + << "TotalTime[Rnd+Rmb+ME] (123) = ( " << sumgtim + sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo+ME] (23) = ( " << sumrtim + sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[RndNumGen] (1) = ( " << sumgtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[Rambo] (2) = ( " << sumrtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "TotalTime[MatrixElems] (3) = ( " << sumwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMatrixElems = ( " << meanwtim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMatrixElems = [ " << minwtim + << " , " << maxwtim << " ] sec" << std::endl + //<< "StdDevTimeInMatrixElems = ( " << stdwtim << std::string(16, ' ') << " ) sec" << std::endl + << "TotalTime[MECalcOnly] (3a) = ( " << sumw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "MeanTimeInMECalcOnly = ( " << meanw3atim << std::string( 16, ' ' ) << " ) sec" << std::endl + << "[Min,Max]TimeInMECalcOnly = [ " << minw3atim + << " , " << maxw3atim << " ] sec" << std::endl + //<< "StdDevTimeInMECalcOnly = ( " << stdw3atim << std::string(16, ' ') << " ) sec" << std::endl + << std::string( SEP79, '-' ) << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl + << "TotalEventsComputed = " << nevtALL << std::endl + << "EvtsPerSec[Rnd+Rmb+ME](123) = ( " << nevtALL / ( sumgtim + sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[Rmb+ME] (23) = ( " << nevtALL / ( sumrtim + sumwtim ) + << std::string( 16, ' ' ) << " 
) sec^-1" << std::endl + //<< "EvtsPerSec[RndNumGen] (1) = ( " << nevtALL/sumgtim + //<< std::string(16, ' ') << " ) sec^-1" << std::endl + //<< "EvtsPerSec[Rambo] (2) = ( " << nevtALL/sumrtim + //<< std::string(16, ' ') << " ) sec^-1" << std::endl + << "EvtsPerSec[MatrixElems] (3) = ( " << nevtALL / sumwtim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << "EvtsPerSec[MECalcOnly] (3a) = ( " << nevtALL / sumw3atim + << std::string( 16, ' ' ) << " ) sec^-1" << std::endl + << std::defaultfloat; // default format: affects all floats + std::cout << std::string( SEP79, '*' ) << std::endl + << hstStats; + } + + // --- 9b Dump to json + const std::string jsonKey = "9b DumpJson"; + timermap.start( jsonKey ); + + if( json ) + { + std::string jsonFileName = std::to_string( jsondate ) + "-perf-test-run" + std::to_string( jsonrun ) + ".json"; + jsonFileName = "./perf/data/" + jsonFileName; + + //Checks if file exists + std::ifstream fileCheck; + bool fileExists = false; + fileCheck.open( jsonFileName ); + if( fileCheck ) + { + fileExists = true; + fileCheck.close(); + } + + std::ofstream jsonFile; + jsonFile.open( jsonFileName, std::ios_base::app ); + if( !fileExists ) + { + jsonFile << "[" << std::endl; + } + else + { + //deleting the last bracket and outputting a ", " + std::string temp = "truncate -s-1 " + jsonFileName; + const char* command = temp.c_str(); + if( system( command ) != 0 ) + std::cout << "WARNING! 
Command '" << temp << "' failed" << std::endl; + jsonFile << ", " << std::endl; + } + + jsonFile << "{" << std::endl + << "\"NumIterations\": " << niter << ", " << std::endl + << "\"NumThreadsPerBlock\": " << gputhreads << ", " << std::endl + << "\"NumBlocksPerGrid\": " << gpublocks << ", " << std::endl +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + << "\"FP precision\": " + << "\"MIXED (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_DOUBLE + << "\"FP precision\": " + << "\"DOUBLE (NaN/abnormal=" << nabn << ")\"," << std::endl +#elif defined MGONGPU_FPTYPE_FLOAT + << "\"FP precision\": " + << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl +#endif + << "\"Complex type\": " +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_CUCOMPLEX + << "\"CUCOMPLEX\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_THRUST + << "\"THRUST::COMPLEX\"," << std::endl +#endif +#else + << "\"STD::COMPLEX\"," << std::endl +#endif + << "\"RanNumb memory layout\": " + << "\"AOSOA[" << neppR << "]\"" + << ( neppR == 1 ? " == AOS" : "" ) << ", " << std::endl + << "\"Momenta memory layout\": " + << "\"AOSOA[" << neppM << "]\"" + << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl +#ifdef __CUDACC__ + //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl +#endif + << "\"Curand generation\": " + << "\"" << rndgentxt << "\"," << std::endl; + + double minelem = hstStats.minME; + double maxelem = hstStats.maxME; + double meanelem = hstStats.meanME(); + double stdelem = hstStats.stdME(); + + jsonFile << "\"NumberOfEntries\": " << niter << "," << std::endl + //<< std::scientific // Not sure about this + << "\"TotalTime[Rnd+Rmb+ME] (123)\": \"" + << std::to_string( sumgtim + sumrtim + sumwtim ) << " sec\"," + << std::endl + << "\"TotalTime[Rambo+ME] (23)\": \"" + << std::to_string( sumrtim + sumwtim ) << " sec\"," << std::endl + << "\"TotalTime[RndNumGen] (1)\": \"" + << std::to_string( sumgtim ) << " sec\"," << std::endl + << "\"TotalTime[Rambo] (2)\": \"" + << std::to_string( sumrtim ) << " sec\"," << std::endl + << "\"TotalTime[MatrixElems] (3)\": \"" + << std::to_string( sumwtim ) << " sec\"," << std::endl + << "\"MeanTimeInMatrixElems\": \"" + << std::to_string( meanwtim ) << " sec\"," << std::endl + << "\"MinTimeInMatrixElems\": \"" + << std::to_string( minwtim ) << " sec\"," << std::endl + << "\"MaxTimeInMatrixElems\": \"" + << std::to_string( maxwtim ) << " sec\"," << std::endl + //<< "ProcessID: = " << getpid() << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl + << "\"TotalEventsComputed\": " << nevtALL << "," << std::endl + << "\"EvtsPerSec[Rnd+Rmb+ME](123)\": \"" + << std::to_string( nevtALL / ( sumgtim + sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[Rmb+ME] (23)\": \"" + << std::to_string( nevtALL / ( sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MatrixElems] (3)\": \"" + << std::to_string( nevtALL / sumwtim ) << " sec^-1\"," << std::endl + << "\"EvtsPerSec[MECalcOnly] (3)\": \"" + << std::to_string( nevtALL / sumw3atim ) << " sec^-1\"," << std::endl + << "\"NumMatrixElems(notAbnormal)\": " << nevtALL - nabn << "," << 
std::endl + << std::scientific + << "\"MeanMatrixElemValue\": " + << "\"" << std::to_string( meanelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdErrMatrixElemValue\": " + << "\"" << std::to_string( stdelem / sqrt( nevtALL ) ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"StdDevMatrixElemValue\": " + << "\"" << std::to_string( stdelem ) + << " GeV^" << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MinMatrixElemValue\": " + << "\"" << std::to_string( minelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl + << "\"MaxMatrixElemValue\": " + << "\"" << std::to_string( maxelem ) << " GeV^" + << std::to_string( meGeVexponent ) << "\"," << std::endl; + + timermap.dump( jsonFile, true ); // NB For the active json timer this dumps a partial total + + jsonFile << "}" << std::endl; + jsonFile << "]"; + jsonFile.close(); + } + + // *** STOP THE NEW TIMERS *** + timermap.stop(); + if( perf ) + { + std::cout << std::string( SEP79, '*' ) << std::endl; + timermap.dump(); + std::cout << std::string( SEP79, '*' ) << std::endl; + } + + // [NB some resources like curand generators will be deleted here when stack-allocated classes go out of scope] + //std::cout << "ALL OK" << std::endl; + return 0; +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp.mk new file mode 120000 index 0000000000..252b38e27a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/cudacpp.mk @@ -0,0 +1 @@ +../cudacpp.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h new file mode 100644 index 0000000000..fd78e0cce4 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h @@ -0,0 +1,11 @@ +#ifndef EPOCH_PROCESS_ID_H +#define EPOCH_PROCESS_ID_H 1 + +// No need to indicate EPOCHX_ any longer for auto-generated code +// However, keep the name of the file as it may be useful again for new manual developments +#define MG_EPOCH_PROCESS_ID SIGMA_MSSM_SLHA2_GG_TTX + +// For simplicity, define here the name of the process-dependent reference file for tests +#define MG_EPOCH_REFERENCE_FILE_NAME "../../../../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt" + +#endif // EPOCH_PROCESS_ID_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.cc new file mode 120000 index 0000000000..cbcc1f579f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.cc @@ -0,0 +1 @@ +../fbridge.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.inc new file mode 120000 index 0000000000..69598a6d2f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fbridge.inc @@ -0,0 +1 @@ +../fbridge.inc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f new file mode 100644 index 0000000000..c0bbf580ef --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f @@ -0,0 +1,84 @@ + PROGRAM FCHECK_SA + IMPLICIT NONE + INCLUDE 'fsampler.inc' + INCLUDE 'fbridge.inc' + INTEGER*8 SAMPLER, BRIDGE ! 
64bit memory addresses + INTEGER NEVTMAX, NEXTERNAL, NP4 + PARAMETER(NEVTMAX=2048*256, NEXTERNAL=4, NP4=4) + CHARACTER*32 ARG0, ARG1, ARG2, ARG3 + INTEGER NARG1, NARG2, NARG3 + INTEGER NEVT, NITER + INTEGER IEVT, IITER +c INTEGER IEXTERNAL + DOUBLE PRECISION MOMENTA(0:NP4-1, NEXTERNAL, NEVTMAX) ! c-array momenta[nevt][nexternal][np4] + DOUBLE PRECISION GS(NEVTMAX) + DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used + DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used + INTEGER*4 CHANID + PARAMETER(CHANID=0) ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 + DOUBLE PRECISION MES(NEVTMAX) + INTEGER*4 SELHEL(NEVTMAX) ! not yet used + INTEGER*4 SELCOL(NEVTMAX) ! not yet used + DOUBLE PRECISION MES_SUM ! use REAL*16 for quadruple precision + INTEGER NEVTOK ! exclude nan/abnormal MEs +C +C READ COMMAND LINE ARGUMENTS +C (NB: most errors will crash the program !) +C + IF ( COMMAND_ARGUMENT_COUNT() == 3 ) THEN + CALL GET_COMMAND_ARGUMENT(1,ARG1) + CALL GET_COMMAND_ARGUMENT(2,ARG2) + CALL GET_COMMAND_ARGUMENT(3,ARG3) + READ (ARG1,'(I4)') NARG1 + READ (ARG2,'(I4)') NARG2 + READ (ARG3,'(I4)') NARG3 + WRITE(6,*) "GPUBLOCKS= ", NARG1 + WRITE(6,*) "GPUTHREADS= ", NARG2 + WRITE(6,*) "NITERATIONS=", NARG3 + NEVT = NARG1 * NARG2 + NITER = NARG3 + IF ( NEVT > NEVTMAX ) THEN + WRITE(6,*) "ERROR! NEVT>NEVTMAX" + STOP + ENDIF + ELSE + CALL GET_COMMAND_ARGUMENT(0,ARG0) + WRITE(6,*) "Usage: ", TRIM(ARG0), + & " gpublocks gputhreads niterations" + STOP + ENDIF +C +C USE SAMPLER AND BRIDGE +C + NEVTOK = 0 + MES_SUM = 0 + CALL FBRIDGECREATE(BRIDGE, NEVT, NEXTERNAL, NP4) ! this must be at the beginning as it initialises the CUDA device + CALL FSAMPLERCREATE(SAMPLER, NEVT, NEXTERNAL, NP4) + DO IITER = 1, NITER + CALL FSAMPLERSEQUENCE(SAMPLER, MOMENTA) + DO IEVT = 1, NEVT + GS(IEVT) = 1.2177157847767195 ! 
fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + END DO + CALL FBRIDGESEQUENCE(BRIDGE, MOMENTA, GS, + & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL) + DO IEVT = 1, NEVT +c DO IEXTERNAL = 1, NEXTERNAL +c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, +c & MOMENTA(0, IEXTERNAL, IEVT), +c & MOMENTA(1, IEXTERNAL, IEVT), +c & MOMENTA(2, IEXTERNAL, IEVT), +c & MOMENTA(3, IEXTERNAL, IEVT) +c END DO +c WRITE(6,*) 'MES ', IEVT, MES(IEVT) +c WRITE(6,*) + IF ( .NOT. ISNAN(MES(IEVT)) ) THEN + NEVTOK = NEVTOK + 1 + MES_SUM = MES_SUM + MES(IEVT) + ENDIF + END DO + END DO + CALL FSAMPLERDELETE(SAMPLER) + CALL FBRIDGEDELETE(BRIDGE) ! this must be at the end as it shuts down the CUDA device + WRITE(6,*) 'Average Matrix Element:', MES_SUM/NEVT/NITER + WRITE(6,*) 'Abnormal MEs:', NEVT*NITER - NEVTOK + END PROGRAM FCHECK_SA diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.cc new file mode 120000 index 0000000000..521c828d41 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.cc @@ -0,0 +1 @@ +../fsampler.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.inc new file mode 120000 index 0000000000..4b0f3c2656 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fsampler.inc @@ -0,0 +1 @@ +../fsampler.inc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu new file mode 120000 index 0000000000..12c1d49d13 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu @@ -0,0 +1 @@ +BridgeKernels.cc \ 
No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu new file mode 120000 index 0000000000..1fc8661d4e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu @@ -0,0 +1 @@ +CPPProcess.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu new file mode 120000 index 0000000000..9a05a7b55a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu @@ -0,0 +1 @@ +CrossSectionKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu new file mode 120000 index 0000000000..82415576cc --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu @@ -0,0 +1 @@ +MatrixElementKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu new file mode 120000 index 0000000000..8dbfaa6493 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu @@ -0,0 +1 @@ +RamboSamplingKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu new file mode 120000 index 0000000000..26580cf106 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu @@ -0,0 +1 @@ +RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu new file mode 120000 index 0000000000..b99171c25e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu @@ -0,0 +1 @@ +check_sa.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile new file mode 120000 index 0000000000..cd937e1d9e --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/makefile @@ -0,0 +1 @@ +cudacpp.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/nvtx.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/nvtx.h new file mode 120000 index 0000000000..a2f268fa94 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/nvtx.h @@ -0,0 +1 @@ +../nvtx.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/ompnumthreads.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/ompnumthreads.h new file mode 120000 index 0000000000..4385e53fca --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/ompnumthreads.h @@ -0,0 +1 @@ +../ompnumthreads.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/perf.py b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/perf.py new file mode 120000 index 0000000000..b7d410aefa --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/perf.py @@ -0,0 +1 @@ +../perf.py \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/profile.sh b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/profile.sh new file mode 120000 index 0000000000..01080a084d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/profile.sh @@ -0,0 +1 @@ +../profile.sh \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/runTest.cc new file mode 120000 index 0000000000..32afd3ca34 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/runTest.cc @@ -0,0 +1 @@ +../runTest.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testmisc.cc new file mode 120000 index 0000000000..3b553cf3f8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testmisc.cc @@ -0,0 +1 @@ +../testmisc.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx.cc new file mode 120000 index 0000000000..045b2f10ea --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx.cc @@ -0,0 +1 @@ +../testxxx.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx_cc_ref.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx_cc_ref.txt new file mode 120000 index 0000000000..51764d98ac --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/testxxx_cc_ref.txt @@ -0,0 +1 @@ +../testxxx_cc_ref.txt \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timer.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timer.h new file mode 120000 index 0000000000..e161ad9e27 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timer.h @@ -0,0 +1 @@ +../timer.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timermap.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timermap.h new file mode 120000 index 0000000000..1479de7fc0 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/timermap.h @@ -0,0 +1 @@ +../timermap.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc new file mode 100644 index 0000000000..ed2e042427 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -0,0 +1,178 @@ +#include "RamboSamplingKernels.h" + +#include "CudaRuntime.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessRandomNumbers.h" +#include "MemoryAccessWeights.h" +#include "MemoryBuffers.h" +#include "rambo.h" // inline implementation of RAMBO algorithms and kernels + +#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + RamboSamplingKernelHost::RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , 
NumberOfEvents( nevt ) + { + if( m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: rndmom must be a host array" ); + if( m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: momenta must be a host array" ); + if( m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelHost: weights must be a host array" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelHost: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( nevt % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( nevt % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: nevt should be a multiple of neppR=" << neppR; + throw std::runtime_error( sstr.str() ); + } + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaInitial() + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( 
m_momenta.data(), ievt ); + getMomentaInitial( m_energy, ievtMomenta ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + + void + RamboSamplingKernelHost::getMomentaFinal() + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + // ** START LOOP ON IEVT ** + for( size_t ievt = 0; ievt < nevt(); ++ievt ) + { + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + const fptype* ievtRndmom = MemoryAccessRandomNumbers::ieventAccessRecordConst( m_rndmom.data(), ievt ); + fptype* ievtMomenta = MemoryAccessMomenta::ieventAccessRecord( m_momenta.data(), ievt ); + fptype* ievtWeights = MemoryAccessWeights::ieventAccessRecord( m_weights.data(), ievt ); + getMomentaFinal( m_energy, ievtRndmom, ievtMomenta, ievtWeights ); + } + // ** END LOOP ON IEVT ** + } + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ) + : SamplingKernelBase( energy, rndmom, momenta, weights ) + , NumberOfEvents( gpublocks * gputhreads ) + , m_gpublocks( gpublocks ) + , m_gputhreads( gputhreads ) + { + if( !m_rndmom.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: rndmom must be a device array" ); + if( !m_momenta.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: momenta must be a device array" ); + if( !m_weights.isOnDevice() ) throw std::runtime_error( "RamboSamplingKernelDevice: weights must be a device array" ); + if( m_gpublocks == 0 ) throw std::runtime_error( "RamboSamplingKernelDevice: gpublocks must be > 0" ); + if( m_gputhreads == 0 ) throw std::runtime_error( 
"RamboSamplingKernelDevice: gputhreads must be > 0" ); + if( this->nevt() != m_rndmom.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with rndmom" ); + if( this->nevt() != m_momenta.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with momenta" ); + if( this->nevt() != m_weights.nevt() ) throw std::runtime_error( "RamboSamplingKernelDevice: nevt mismatch with weights" ); + // Sanity checks for memory access (momenta buffer) + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + if( m_gputhreads % neppM != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelHost: gputhreads should be a multiple of neppM=" << neppM; + throw std::runtime_error( sstr.str() ); + } + // Sanity checks for memory access (random number buffer) + constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout + static_assert( ispoweroftwo( neppR ), "neppR is not a power of 2" ); + if( m_gputhreads % neppR != 0 ) + { + std::ostringstream sstr; + sstr << "RamboSamplingKernelDevice: gputhreads should be a multiple of neppR=" << neppR; + throw std::runtime_error( sstr.str() ); + } + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + getMomentaInitialDevice( const fptype energy, + fptype* momenta ) + { + constexpr auto getMomentaInitial = ramboGetMomentaInitial; + return getMomentaInitial( energy, momenta ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + void + RamboSamplingKernelDevice::getMomentaInitial() + { + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + __global__ void + getMomentaFinalDevice( const fptype energy, + const fptype* rndmom, + fptype* momenta, 
+ fptype* wgts ) + { + constexpr auto getMomentaFinal = ramboGetMomentaFinal; + return getMomentaFinal( energy, rndmom, momenta, wgts ); + } +#endif + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + void + RamboSamplingKernelDevice::getMomentaFinal() + { + getMomentaFinalDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + } +#endif + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h new file mode 100644 index 0000000000..f40433af4a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -0,0 +1,129 @@ +#ifndef RAMBOSAMPLINGKERNELS_H +#define RAMBOSAMPLINGKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + // A base class encapsulating phase space sampling on a CPU host or on a GPU device + class SamplingKernelBase //: virtual public ISamplingKernel + { + protected: + + // Constructor from existing input and output buffers + SamplingKernelBase( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights ) // output: weights + : m_energy( energy ) + , m_rndmom( rndmom ) + , m_momenta( momenta ) + , m_weights( weights ) + { + } + + public: + + // Destructor + virtual ~SamplingKernelBase() {} + + // Get momenta of initial state particles + virtual void getMomentaInitial() = 0; + + // Get momenta of final state particles and weights + virtual void getMomentaFinal() = 0; + + // Is this a host or device kernel?
+ virtual bool isOnDevice() const = 0; + + protected: + + // The energy + const fptype m_energy; + + // The buffer for the input random numbers + const BufferRndNumMomenta& m_rndmom; + + // The buffer for the output momenta + BufferMomenta& m_momenta; + + // The buffer for the output weights + BufferWeights& m_weights; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating RAMBO phase space sampling on a CPU host + class RamboSamplingKernelHost final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelHost( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t nevt ); + + // Destructor + virtual ~RamboSamplingKernelHost() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return false; } + }; + + //-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + // A class encapsulating RAMBO phase space sampling on a GPU device + class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents + { + public: + + // Constructor from existing input and output buffers + RamboSamplingKernelDevice( const fptype energy, // input: energy + const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] + BufferMomenta& momenta, // output: momenta + BufferWeights& weights, // output: weights + const size_t gpublocks, + const size_t gputhreads ); + + // Destructor + virtual ~RamboSamplingKernelDevice() {} + + // Get momenta of initial state particles + void getMomentaInitial() override final; + + // Get momenta of final state particles and weights + void getMomentaFinal() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return true; } + + private: + + // The number of blocks in the GPU grid + size_t m_gpublocks; + + // The number of threads in the GPU grid + size_t m_gputhreads; + }; +#endif + + //-------------------------------------------------------------------------- +} +#endif // RAMBOSAMPLINGKERNELS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc new file mode 100644 index 0000000000..eb8bc09ea9 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc @@ -0,0 +1,149 @@ +#include "RandomNumberKernels.h" + +#include "CommonRandomNumbers.h" +#include "CudaRuntime.h" +#include "MemoryBuffers.h" + +#include + +#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +#define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } +inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != 
CURAND_STATUS_SUCCESS ) + { + printf( "CurandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == CURAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + CommonRandomNumberKernel::CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ) + : RandomNumberKernelBase( rnarray ) + , m_seed( 20211220 ) + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CommonRandomNumberKernel on host with a device random number array" ); + } + + //-------------------------------------------------------------------------- + + void CommonRandomNumberKernel::generateRnarray() + { + std::vector rnd = CommonRandomNumbers::generate( m_rnarray.size(), m_seed ); // NB: generate as double (HARDCODED) + std::copy( rnd.begin(), rnd.end(), m_rnarray.data() ); // NB: copy may imply a double-to-float conversion + } + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HAS_NO_CURAND + CurandRandomNumberKernel::CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef __CUDACC__ + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "CurandRandomNumberKernel does not support CurandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + CurandRandomNumberKernel::~CurandRandomNumberKernel() + { + destroyGenerator(); + } + + 
//-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( "seedGenerator: seed %d\n", seed ); + checkCurand( curandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::createGenerator() + { + // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] + const curandRngType_t type = CURAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) + //const curandRngType_t type = CURAND_RNG_PSEUDO_XORWOW; // 0.049s | 0.0016s + //const curandRngType_t type = CURAND_RNG_PSEUDO_MRG32K3A; // 0.71s | 0.0012s (better but slower, especially in c++) + //const curandRngType_t type = CURAND_RNG_PSEUDO_MT19937; // 21s | 0.021s + //const curandRngType_t type = CURAND_RNG_PSEUDO_PHILOX4_32_10; // 0.024s | 0.00026s (used to segfault?) 
+ if( m_isOnDevice ) + { + checkCurand( curandCreateGenerator( &m_rnGen, type ) ); + } + else + { + checkCurand( curandCreateGeneratorHost( &m_rnGen, type ) ); + } + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_LEGACY ) ); // fails with code=104 (see #429) + checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_BEST ) ); + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_DYNAMIC ) ); // fails with code=104 (see #429) + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_SEEDED ) ); // fails with code=104 (see #429) + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::destroyGenerator() + { + checkCurand( curandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkCurand( curandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef __CUDACC__ + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef __CUDACC__ + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h new file mode 100644 index 0000000000..4d55f3d449 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -0,0 +1,146 @@ +#ifndef RANDOMNUMBERKERNELS_H +#define RANDOMNUMBERKERNELS_H 1 + +#include "mgOnGpuConfig.h" + +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +#ifndef MGONGPU_HAS_NO_CURAND +#include "curand.h" +#endif + +#include "MemoryBuffers.h" + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + /* + // An interface encapsulating random number generation on a CPU host or on a GPU device + class IRandomNumberKernel + { + public: + + // Destructor + virtual ~IRandomNumberKernel(){} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void generateRnarray() = 0; + + // Is this a host or device kernel? + virtual bool isOnDevice() const = 0; + + }; + */ + + //-------------------------------------------------------------------------- + + // A base class encapsulating random number generation on a CPU host or on a GPU device + class RandomNumberKernelBase //: virtual public IRandomNumberKernel + { + + protected: + + // Constructor from an existing output buffer + RandomNumberKernelBase( BufferRndNumMomenta& rnarray ) + : m_rnarray( rnarray ) {} + + public: + + // Destructor + virtual ~RandomNumberKernelBase() {} + + // Seed the random number generator + virtual void seedGenerator( const unsigned int seed ) = 0; + + // Generate the random number array + virtual void generateRnarray() = 0; + + // Is this a host or device kernel? 
+ virtual bool isOnDevice() const = 0; + + protected: + + // The buffer for the output random numbers + BufferRndNumMomenta& m_rnarray; + }; + + //-------------------------------------------------------------------------- + + // A class encapsulating common random number generation on a CPU host + class CommonRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ); + + // Destructor + ~CommonRandomNumberKernel() {} + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final { m_seed = seed; }; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? + bool isOnDevice() const override final { return false; } + + private: + + // The generator seed + unsigned int m_seed; + }; + + //-------------------------------------------------------------------------- + +#ifndef MGONGPU_HAS_NO_CURAND + // A class encapsulating CURAND random number generation on a CPU host or on a GPU device + class CurandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~CurandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The curand generator + curandGenerator_t m_rnGen; + }; + +#endif + + //-------------------------------------------------------------------------- +} +#endif // RANDOMNUMBERKERNELS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk new file mode 100644 index 0000000000..2155495366 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -0,0 +1,798 @@ +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories + +CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +CUDACPP_SRC_MAKEFILE = cudacpp_src.mk + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) + +# Detect O/S kernel (Linux, Darwin...) +UNAME_S := $(shell uname -s) +###$(info UNAME_S='$(UNAME_S)') + +# Detect architecture (x86_64, ppc64le...) 
+UNAME_P := $(shell uname -p) +###$(info UNAME_P='$(UNAME_P)') + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA + +INCFLAGS = -I. +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +# Dependency on src directory +MG5AMC_COMMONLIB = mg5amc_common +LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +INCFLAGS += -I../../src + +# Dependency on tools directory +TOOLSDIR = ../../../../../tools +INCFLAGS += -I$(TOOLSDIR) + +# Dependency on test directory +TESTDIR = ../../../../../test +GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/ +GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) $(USE_NVTX) -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +CXXFLAGS+= -ffast-math # see issue #117 +endif +###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Optionally add debug flags to display the full list of flags (eg on Darwin) +###CXXFLAGS+= -v + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html + +#------------------------------------------------------------------------------- + +#=== Configure the CUDA compiler + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the location of nvcc +ifndef CUDA_HOME + 
CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + NVCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CULIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+ CUOPTFLAGS = -lineinfo + CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) +else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) +else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override NVCC= + override USE_NVTX= + override CULIBFLAGS= +endif + +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler +endif + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ and CUDA builds + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) + endif +endif + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for C++ and CUDA + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 + ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change + ###CXXFLAGS+= -fpeel-loops # no change + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -ftree-vectorize # no change + ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! +else + ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... + ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +# PowerPC-specific CUDA compiler flags (to be reviewed!) 
+ifeq ($(UNAME_P),ppc64le) + CUFLAGS+= -Xcompiler -mno-float128 +endif + +#------------------------------------------------------------------------------- + +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the default OMPFLAGS choice +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) +override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +else +override OMPFLAGS = -fopenmp +###override OMPFLAGS = # disable OpenMP MT (default before #575) +endif + +# Set the default AVX (vectorization) choice +ifeq ($(AVX),) + ifeq ($(UNAME_P),ppc64le) + ###override AVX = none + override AVX = sse4 + else ifeq ($(UNAME_P),arm) + ###override AVX = none + override AVX = sse4 + else ifeq ($(wildcard /proc/cpuinfo),) + override AVX = none + $(warning Using AVX='$(AVX)' because host SIMD features cannot be read from /proc/cpuinfo) + else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) + override AVX = 512y + ###$(info Using AVX='$(AVX)' as no user input exists) + else + override AVX = avx2 + ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) + $(warning Using AVX='$(AVX)' because host does not support avx512vl) + else + $(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang) + endif + endif +else + ###$(info Using AVX='$(AVX)' according to user input) +endif + +# Set the default FPTYPE (floating point type) choice +ifeq ($(FPTYPE),) + override FPTYPE = d +endif + +# Set the default HELINL (inline helicities?) 
choice +ifeq ($(HELINL),) + override HELINL = 0 +endif + +# Set the default HRDCOD (hardcode cIPD physics parameters?) choice +ifeq ($(HRDCOD),) + override HRDCOD = 0 +endif + +# Set the default RNDGEN (random number generator) choice +ifeq ($(NVCC),) + override RNDGEN = hasNoCurand +else ifeq ($(RNDGEN),) + override RNDGEN = hasCurand +endif + +# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +export AVX +export FPTYPE +export HELINL +export HRDCOD +export RNDGEN +export OMPFLAGS + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the build flags appropriate to OMPFLAGS +$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each AVX choice (example: "make AVX=none") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +$(info AVX=$(AVX)) +ifeq ($(UNAME_P),ppc64le) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) + endif +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(AVX),none) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(AVX),sse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else 
ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +else + ifeq ($(AVX),none) + override AVXFLAGS = -march=x86-64 # no SIMD (see #588) + else ifeq ($(AVX),sse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +endif +# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? 
+CXXFLAGS+= $(AVXFLAGS) + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +$(info FPTYPE=$(FPTYPE)) +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +$(info HELINL=$(HELINL)) +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +$(info HRDCOD=$(HRDCOD)) +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") +$(info RNDGEN=$(RNDGEN)) +ifeq ($(RNDGEN),hasNoCurand) + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifneq ($(RNDGEN),hasCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build directory "short" tag (defines target and path to the optional build directory) +# (Rationale: keep directory names shorter, e.g. 
do not include random number generator choice) +override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +ifeq ($(USEBUILDDIR),1) + override BUILDDIR = build.$(DIRTAG) + override LIBDIR = ../../lib/$(BUILDDIR) + override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is set = 1)) +else + override BUILDDIR = . + override LIBDIR = ../../lib + override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' + $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +###override INCDIR = ../../include +###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH +# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) +# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary +ifeq ($(UNAME_S),Darwin) + override CXXLIBFLAGSRPATH = + override CULIBFLAGSRPATH = + override CXXLIBFLAGSRPATH2 = + override CULIBFLAGSRPATH2 = +else + # RPATH to cuda/cpp libs when linking executables + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + # RPATH to common lib when linking cuda/cpp libs + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' +endif + +# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) +override RUNTIME = + +#=============================================================================== +#=== Makefile TARGETS 
and build rules below +#=============================================================================== + +cxx_main=$(BUILDDIR)/check.exe +fcxx_main=$(BUILDDIR)/fcheck.exe + +ifneq ($(NVCC),) +cu_main=$(BUILDDIR)/gcheck.exe +fcu_main=$(BUILDDIR)/fgcheck.exe +else +cu_main= +fcu_main= +endif + +testmain=$(BUILDDIR)/runTest.exe + +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(testmain) $(fcu_main) $(fcxx_main) + +# Target (and build options): debug +MAKEDEBUG= +debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: CUOPTFLAGS = -G +debug: MAKEDEBUG := debug +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +$(BUILDDIR)/.build.$(TAG): + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @touch $(BUILDDIR)/.build.$(TAG) + +# Generic target and build rules: objects from CUDA compilation +ifneq ($(NVCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ +endif + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CUINC) -fPIC -c $< -o $@ + +# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +endif +endif + +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins +endif +endif + +# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) +# This patch does remove the warning, but I prefer to keep it disabled for the moment... +###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) +###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###endif +###endif + +#### Apply special build flags only to CPPProcess.cc (-flto) +###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto + +#### Apply special build flags only to CPPProcess.cc (AVXFLAGS) +###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) + +#------------------------------------------------------------------------------- + +# Target (and build rules): common (src) library +commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc + $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) + +#------------------------------------------------------------------------------- + +processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') +###$(info 
processid_short=$(processid_short)) + +MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp +cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o +cxx_objects_exe=$(BUILDDIR)/RandomNumberKernels.o $(BUILDDIR)/RamboSamplingKernels.o + +ifneq ($(NVCC),) +MG5AMC_CULIB = mg5amc_$(processid_short)_cuda +cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o +cu_objects_exe=$(BUILDDIR)/gRandomNumberKernels.o $(BUILDDIR)/gRamboSamplingKernels.o +endif + +# Target (and build rules): C++ and CUDA shared libraries +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o +$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) + $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + +ifneq ($(NVCC),) +$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o +$(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o +$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): Fortran include files +###$(INCDIR)/%.inc : ../%.inc +### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi +### \cp $< $@ + +#------------------------------------------------------------------------------- + +# Target (and build rules): C++ and CUDA standalone executables +$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + +ifneq ($(NVCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from Fortran compilation +$(BUILDDIR)/%.o : %.f *.inc + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(FC) -I. -c $< -o $@ + +# Generic target and build rules: objects from Fortran compilation +###$(BUILDDIR)/%.o : %.f *.inc +### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi +### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi +### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ + +# Target (and build rules): Fortran standalone executables +###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc + +ifeq ($(UNAME_S),Darwin) +$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +endif +$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) + +ifneq ($(NVCC),) +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +endif +ifeq ($(UNAME_S),Darwin) +$(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 +endif +$(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +endif + +#------------------------------------------------------------------------------- + +# Target (and build rules): test objects and test executable +$(BUILDDIR)/testxxx.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt +$(testmain): $(BUILDDIR)/testxxx.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions + +ifneq ($(NVCC),) 
+$(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) +$(BUILDDIR)/testxxx_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt +$(testmain): $(BUILDDIR)/testxxx_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/testxxx_cu.o # Comment out this line to skip the CUDA test of xxx functions +endif + +$(BUILDDIR)/testmisc.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/testmisc.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests + +ifneq ($(NVCC),) +$(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) +$(BUILDDIR)/testmisc_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/testmisc_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/testmisc_cu.o # Comment out this line to skip the CUDA miscellaneous tests +endif + +$(BUILDDIR)/runTest.o: $(GTESTLIBS) +$(BUILDDIR)/runTest.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): $(BUILDDIR)/runTest.o +$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o + +ifneq ($(NVCC),) +$(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) +$(BUILDDIR)/runTest_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +ifneq ($(shell $(CXX) --version | grep ^Intel),) +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +endif +$(testmain): $(BUILDDIR)/runTest_cu.o +$(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o +endif + +$(testmain): $(GTESTLIBS) +$(testmain): INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest 
-lgtest_main + +ifneq ($(OMPFLAGS),) +ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 +###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) +else +$(testmain): LIBFLAGS += -lgomp +endif +endif + +ifeq ($(NVCC),) # link only runTest.o +$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) +else # link both runTest.o and runTest_cu.o +$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH +$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) + $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) +endif + +# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 +$(GTESTLIBS): +ifneq ($(shell which flock 2>/dev/null),) + flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) +else + $(MAKE) -C $(TESTDIR) +endif + +#------------------------------------------------------------------------------- + +# Target: build all targets in all AVX modes (each AVX mode in a separate build directory) +# Split the avxall target into five separate targets to allow parallel 'make -j avxall' builds +# (Hack: add a fbridge.inc dependency to avxall, to ensure it is only copied once for all AVX modes) +avxnone: 
+ @echo + $(MAKE) USEBUILDDIR=1 AVX=none -f $(CUDACPP_MAKEFILE) + +avxsse4: + @echo + $(MAKE) USEBUILDDIR=1 AVX=sse4 -f $(CUDACPP_MAKEFILE) + +avxavx2: + @echo + $(MAKE) USEBUILDDIR=1 AVX=avx2 -f $(CUDACPP_MAKEFILE) + +avx512y: + @echo + $(MAKE) USEBUILDDIR=1 AVX=512y -f $(CUDACPP_MAKEFILE) + +avx512z: + @echo + $(MAKE) USEBUILDDIR=1 AVX=512z -f $(CUDACPP_MAKEFILE) + +ifeq ($(UNAME_P),ppc64le) +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 +avxall: avxnone avxsse4 +else ifeq ($(UNAME_P),arm) +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 +avxall: avxnone avxsse4 +else +###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 avxavx2 avx512y avx512z +avxall: avxnone avxsse4 avxavx2 avx512y avx512z +endif + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(BUILDDIR) +else + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe + rm -f $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(LIBDIR)/lib$(MG5AMC_CULIB).so +endif + $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) +### rm -rf $(INCDIR) + +cleanall: + @echo + $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) + @echo + $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) + rm -rf build.* + +# Target: clean the builds as well as the googletest installation +distclean: cleanall + $(MAKE) -C $(TESTDIR) clean + +#------------------------------------------------------------------------------- + +# Target: show system and compiler information +info: + @echo "" + @uname -spn # e.g. 
Linux nodename.cern.ch x86_64 +ifeq ($(UNAME_S),Darwin) + @sysctl -a | grep -i brand + @sysctl -a | grep machdep.cpu | grep features || true + @sysctl -a | grep hw.physicalcpu: + @sysctl -a | grep hw.logicalcpu: +else + @cat /proc/cpuinfo | grep "model name" | sort -u + @cat /proc/cpuinfo | grep "flags" | sort -u + @cat /proc/cpuinfo | grep "cpu cores" | sort -u + @cat /proc/cpuinfo | grep "physical id" | sort -u +endif + @echo "" +ifneq ($(shell which nvidia-smi 2>/dev/null),) + nvidia-smi -L + @echo "" +endif + @echo USECCACHE=$(USECCACHE) +ifeq ($(USECCACHE),1) + ccache --version | head -1 +endif + @echo "" + @echo NVCC=$(NVCC) +ifneq ($(NVCC),) + $(NVCC) --version +endif + @echo "" + @echo CXX=$(CXX) +ifneq ($(shell $(CXX) --version | grep ^clang),) + @echo $(CXX) -v + @$(CXX) -v |& egrep -v '(Found|multilib)' + @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' +else + $(CXX) --version +endif + @echo "" + @echo FC=$(FC) + $(FC) --version + +#------------------------------------------------------------------------------- + +# Target: check (run the C++ test executable) +# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] 
+ifneq ($(NVCC),) +check: runTest cmpFcheck cmpFGcheck +else +check: runTest cmpFcheck +endif + +# Target: runTest (run the C++ test executable runTest.exe) +runTest: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/runTest.exe + +# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) +runCheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 + +# Target: runGcheck (run the CUDA standalone executable gcheck.exe, with a small number of events) +runGcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 + +# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) +runFcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 + +# Target: runFGcheck (run the Fortran standalone executable - with CUDA MEs - fgcheck.exe, with a small number of events) +runFGcheck: all.$(TAG) + $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 + +# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) +cmpFcheck: all.$(TAG) + @echo + @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + +# Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) +cmpFGcheck: all.$(TAG) + @echo + @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" + @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + +# Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) +memcheck: all.$(TAG) + $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 + +#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc new file mode 100644 index 0000000000..9c9287e0c5 --- /dev/null 
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc
@@ -0,0 +1,126 @@
+#include "Bridge.h"
+#include "CPPProcess.h"
+#include "CudaRuntime.h"
+
+extern "C"
+{
+  /**
+   * The namespace where the Bridge class is taken from.
+   *
+   * In the current implementation, two separate shared libraries are created for the GPU/CUDA and CPU/C++ implementations.
+   * Actually, two shared libraries for GPU and CPU are created for each of the five SIMD implementations on CPUs (none, sse4, avx2, 512y, 512z).
+   * A single fcreatebridge_ symbol is created in each library with the same name, connected to the appropriate Bridge on CPU or GPU.
+   * The Fortran MadEvent code is always the same: the choice whether to use a CPU or GPU implementation is done by linking the appropriate library.
+   * As the names of the two CPU/GPU libraries are the same in the five SIMD implementations, the choice of SIMD is done by setting LD_LIBRARY_PATH.
+   *
+   * In a future implementation, a single heterogeneous shared library may be created, with the same interface.
+   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
+   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
+   */
+#ifdef __CUDACC__
+  using namespace mg5amcGpu;
+#else
+  using namespace mg5amcCpu;
+#endif
+
+  /**
+   * The floating point precision used in Fortran arrays.
+   * This is presently hardcoded to double precision (REAL*8).
+   */
+  using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays
+  //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays
+
+  /**
+   * Create a Bridge and return its pointer.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param nevtF the pointer to the number of events in the Fortran arrays
+   * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   */
+  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
+  {
+#ifdef __CUDACC__
+    CudaRuntime::setUp();
+#endif
+    // Create a process object, read parm card and set parameters
+    // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
+    // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
+    CPPProcess process( /*verbose=*/false );
+    process.initProc( "../../Cards/param_card.dat" );
+    // FIXME: disable OMP in Bridge when called from Fortran
+    *ppbridge = new Bridge<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
+  }
+
+  /**
+   * Delete a Bridge.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   */
+  void fbridgedelete_( CppObjectInFortran** ppbridge )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
+    delete pbridge;
+#ifdef __CUDACC__
+    CudaRuntime::tearDown();
+#endif
+  }
+
+  /**
+   * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param momenta the pointer to the input 4-momenta
+   * @param gs the pointer to the input Gs (running QCD coupling constant alphas)
+   * @param rndhel the pointer to the input random numbers for helicity selection
+   * @param rndcol the pointer to the input random numbers for color selection
+   * @param channelId the pointer to the input Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
+   * @param mes the pointer to the output matrix elements
+   * @param selhel the pointer to the output selected helicities
+   * @param selcol the pointer to the output selected colors
+   */
+  void fbridgesequence_( CppObjectInFortran** ppbridge,
+                         const FORTRANFPTYPE* momenta,
+                         const FORTRANFPTYPE* gs,
+                         const FORTRANFPTYPE* rndhel,
+                         const FORTRANFPTYPE* rndcol,
+                         const unsigned int* pchannelId,
+                         FORTRANFPTYPE* mes,
+                         int* selhel,
+                         int* selcol )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
+#ifdef __CUDACC__
+    // Use the device/GPU implementation in the CUDA library
+    // (there is also a host implementation in this library)
+    pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
+#else
+    // Use the host/CPU implementation in the C++ library
+    // (there is no device implementation in this library)
+    pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );
+#endif
+  }
+
+  /**
+   * Retrieve the number of good helicities for helicity filtering in the Bridge.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param pngoodhel the pointer to the output number of good helicities
+   * @param pntothel the pointer to the output total number of helicities
+   */
+  void fbridgegetngoodhel_( CppObjectInFortran** ppbridge,
+                            unsigned int* pngoodhel,
+                            unsigned int* pntothel )
+  {
+    Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
+    if( pbridge == 0 ) throw std::runtime_error( "fbridgegetngoodhel_: invalid Bridge address" );
+    *pngoodhel = pbridge->nGoodHel();
+    *pntothel = pbridge->nTotHel();
+  }
+}
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc
new file mode 100644
index 0000000000..f140b660fc
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc
@@ -0,0 +1,66 @@
+C
+C Create a Bridge and return its pointer
+C - PBRIDGE: the memory address of the C++ Bridge
+C - NEVT: the number of events in the Fortran arrays
+C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C
+      INTERFACE
+         SUBROUTINE FBRIDGECREATE(PBRIDGE, NEVT, NPAR, NP4)
+         INTEGER*8 PBRIDGE
+         INTEGER*4 NEVT
+         INTEGER*4 NPAR
+         INTEGER*4 NP4
+         END SUBROUTINE FBRIDGECREATE
+      END INTERFACE
+
+C
+C Delete a Bridge.
+C - PBRIDGE: the memory address of the C++ Bridge
+C
+      INTERFACE
+         SUBROUTINE FBRIDGEDELETE(PBRIDGE)
+         INTEGER*8 PBRIDGE
+         END SUBROUTINE FBRIDGEDELETE
+      END INTERFACE
+
+C
+C Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++.
+C - PBRIDGE: the memory address of the C++ Bridge
+C - MOMENTA: the input 4-momenta Fortran array
+C - GS: the input Gs (running QCD coupling constant alphas) Fortran array
+C - RNDHEL: the input random number Fortran array for helicity selection
+C - RNDCOL: the input random number Fortran array for color selection
+C - CHANID: the input Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0)
+C - MES: the output matrix element Fortran array
+C - SELHEL: the output selected helicity Fortran array
+C - SELCOL: the output selected color Fortran array
+C
+      INTERFACE
+         SUBROUTINE FBRIDGESEQUENCE(PBRIDGE, MOMENTA, GS,
+     &     RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL)
+         INTEGER*8 PBRIDGE
+         DOUBLE PRECISION MOMENTA(*)
+         DOUBLE PRECISION GS(*)
+         DOUBLE PRECISION RNDHEL(*)
+         DOUBLE PRECISION RNDCOL(*)
+         INTEGER*4 CHANID
+         DOUBLE PRECISION MES(*)
+         INTEGER*4 SELHEL(*)
+         INTEGER*4 SELCOL(*)
+         END SUBROUTINE FBRIDGESEQUENCE
+      END INTERFACE
+
+C
+C Retrieve the number of good helicities for helicity filtering in the Bridge.
+C - PBRIDGE: the memory address of the C++ Bridge
+C - NGOODHEL: the output number of good helicities
+C - NTOTHEL: the output total number of helicities in cudacpp (aka NCOMB in Fortran)
+C
+      INTERFACE
+         SUBROUTINE FBRIDGEGETNGOODHEL(PBRIDGE, NGOODHEL, NTOTHEL)
+         INTEGER*8 PBRIDGE
+         INTEGER*4 NGOODHEL
+         INTEGER*4 NTOTHEL
+         END SUBROUTINE FBRIDGEGETNGOODHEL
+      END INTERFACE
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc
new file mode 100644
index 0000000000..bc90937f47
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc
@@ -0,0 +1,159 @@
+#include "mgOnGpuConfig.h"
+
+#include "Bridge.h"
+#include "MemoryBuffers.h"
+#include "RamboSamplingKernels.h"
+#include "RandomNumberKernels.h"
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
+  template<typename FORTRANFPTYPE>
+  class Sampler final : public CppObjectInFortran
+  {
+  public:
+    // Constructor
+    // @param nevtF (VECSIZE_USED, vector.inc) number of events in Fortran arrays
+    // @param nparF (NEXTERNAL, nexternal.inc) number of external particles in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+    // @param np4F number of momenta components, usually 4, in Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+    Sampler( int nevtF, int nparF, int np4F );
+    // Destructor
+    virtual ~Sampler() {}
+    // Delete copy/move constructors and assignment operators
+    Sampler( const Sampler& ) = delete;
+    Sampler( Sampler&& ) = delete;
+    Sampler& operator=( const Sampler& ) = delete;
+    Sampler& operator=( Sampler&& ) = delete;
+    // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta
+    void samplerHostSequence( FORTRANFPTYPE* fortranMomenta );
+  private:
+    const int m_nevt; // The number of events in each iteration
+    int m_iiter;      // The iteration counter (for random number seeding)
+#ifndef __CUDACC__
+    HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
+    HostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
+    HostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
+#else
+    PinnedHostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
+    PinnedHostBufferMomenta m_hstMomenta;      // Memory buffers for momenta
+    PinnedHostBufferWeights m_hstWeights;      // Memory buffers for sampling weights
+#endif
+    std::unique_ptr<RandomNumberKernelBase> m_prnk; // The appropriate RandomNumberKernel
+    std::unique_ptr<SamplingKernelBase> m_prsk;     // The appropriate SamplingKernel
+    // HARDCODED DEFAULTS
+    static constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak)
+  };
+
+  template<typename FORTRANFPTYPE>
+  Sampler<FORTRANFPTYPE>::Sampler( int nevtF, int nparF, int np4F )
+    : m_nevt( nevtF )
+    , m_iiter( 0 )
+    , m_hstRndmom( nevtF )
+    , m_hstMomenta( nevtF )
+    , m_hstWeights( nevtF )
+    , m_prnk( new CommonRandomNumberKernel( m_hstRndmom ) )
+    , m_prsk( new RamboSamplingKernelHost( energy, m_hstRndmom, m_hstMomenta, m_hstWeights, nevtF ) )
+  {
+    if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Sampler constructor: npar mismatch" );
+    if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Sampler constructor: np4 mismatch" );
+    std::cout << "WARNING! Instantiate host Sampler (nevt=" << m_nevt << ")" << std::endl;
+  }
+
+  // Draw random numbers and convert them to momenta in C++, then transpose them to Fortran momenta
+  template<typename FORTRANFPTYPE>
+  void Sampler<FORTRANFPTYPE>::samplerHostSequence( FORTRANFPTYPE* fortranMomenta )
+  {
+    std::cout << "Iteration #" << m_iiter + 1 << std::endl;
+    // === STEP 1 OF 3
+    // --- 1a. Seed rnd generator (to get same results on host and device in curand)
+    // [NB This should not be necessary using the host API: "Generation functions
+    // can be called multiple times on the same generator to generate successive
+    // blocks of results. For pseudorandom generators, multiple calls to generation
+    // functions will yield the same result as a single call with a large size."]
+    // *** NB! REMEMBER THAT THE FORTRAN SAMPLER ALWAYS USES COMMON RANDOM NUMBERS! ***
+    constexpr unsigned long long seed = 20200805;
+    m_prnk->seedGenerator( seed + m_iiter );
+    m_iiter++;
+    // --- 1b. Generate all relevant numbers to build nevt events (i.e. nevt phase space points) on the host
+    m_prnk->generateRnarray();
+    //std::cout << "Got random numbers" << std::endl;
+    // === STEP 2 OF 3
+    // --- 2a. Fill in momenta of initial state particles on the device
+    m_prsk->getMomentaInitial();
+    //std::cout << "Got initial momenta" << std::endl;
+    // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device
+    // (i.e. map random numbers to final-state particle momenta for each of nevt events)
+    m_prsk->getMomentaFinal();
+    //std::cout << "Got final momenta" << std::endl;
+    // --- 2c. TransposeC2F
+    hst_transposeMomentaC2F( m_hstMomenta.data(), fortranMomenta, m_nevt );
+  }
+}
+
+//--------------------------------------------------------------------------
+
+extern "C"
+{
+#ifdef __CUDACC__
+  using namespace mg5amcGpu;
+#else
+  using namespace mg5amcCpu;
+#endif
+
+  /**
+   * The floating point precision used in Fortran arrays.
+   * This is presently hardcoded to double precision (REAL*8).
+   */
+  using FORTRANFPTYPE = double; // for Fortran double precision (REAL*8) arrays
+  //using FORTRANFPTYPE = float; // for Fortran single precision (REAL*4) arrays
+
+  /**
+   * Create a Sampler and return its pointer.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param nevtF the pointer to the number of events in the Fortran arrays
+   * @param nparF the pointer to the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   * @param np4F the pointer to the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY)
+   */
+  void fsamplercreate_( CppObjectInFortran** ppsampler, const int* pnevtF, const int* pnparF, const int* pnp4F )
+  {
+    *ppsampler = new Sampler<FORTRANFPTYPE>( *pnevtF, *pnparF, *pnp4F );
+  }
+
+  /**
+   * Delete a Sampler.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   */
+  void fsamplerdelete_( CppObjectInFortran** ppsampler )
+  {
+    Sampler<FORTRANFPTYPE>* psampler = dynamic_cast<Sampler<FORTRANFPTYPE>*>( *ppsampler );
+    if( psampler == 0 ) throw std::runtime_error( "fsamplerdelete_: invalid Sampler address" );
+    delete psampler;
+  }
+
+  /**
+   * Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CUDA/C++.
+   * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
+   *
+   * @param ppsampler the pointer to the Sampler pointer (the Sampler pointer is handled in Fortran as an INTEGER*8 variable)
+   * @param momenta the pointer to the input 4-momenta
+   * @param mes the pointer to the output matrix elements
+   */
+  void fsamplersequence_( CppObjectInFortran** ppsampler, FORTRANFPTYPE* momenta )
+  {
+    Sampler<FORTRANFPTYPE>* psampler = dynamic_cast<Sampler<FORTRANFPTYPE>*>( *ppsampler );
+    if( psampler == 0 ) throw std::runtime_error( "fsamplersequence_: invalid Sampler address" );
+    // Use the host/CPU implementation (there is no device implementation)
+    psampler->samplerHostSequence( momenta );
+  }
+}
+
+//--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc
new file mode 100644
index 0000000000..d4895df206
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc
@@ -0,0 +1,37 @@
+C
+C Create a Sampler and return its pointer
+C - PSAMPLER: the memory address of the C++ Sampler
+C - NEVT: the number of events in the Fortran arrays
+C - NPAR: the number of external particles in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C - NP4: the number of momenta components, usually 4, in the Fortran arrays (KEPT FOR SANITY CHECKS ONLY: remove it?)
+C
+      INTERFACE
+         SUBROUTINE FSAMPLERCREATE(PSAMPLER, NEVT, NPAR, NP4)
+         INTEGER*8 PSAMPLER
+         INTEGER*4 NEVT
+         INTEGER*4 NPAR
+         INTEGER*4 NP4
+         END SUBROUTINE FSAMPLERCREATE
+      END INTERFACE
+
+C
+C Delete a Sampler.
+C - PSAMPLER: the memory address of the C++ Sampler
+C
+      INTERFACE
+         SUBROUTINE FSAMPLERDELETE(PSAMPLER)
+         INTEGER*8 PSAMPLER
+         END SUBROUTINE FSAMPLERDELETE
+      END INTERFACE
+
+C
+C Execute the matrix-element calculation "sequence" via a Sampler on GPU/CUDA or CUDA/C++.
+C - PSAMPLER: the memory address of the C++ Sampler
+C - MOMENTA: the output 4-momenta Fortran array
+C
+      INTERFACE
+         SUBROUTINE FSAMPLERSEQUENCE(PSAMPLER, MOMENTA)
+         INTEGER*8 PSAMPLER
+         DOUBLE PRECISION MOMENTA(*)
+         END SUBROUTINE FSAMPLERSEQUENCE
+      END INTERFACE
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h
new file mode 100644
index 0000000000..e206b8e075
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h
@@ -0,0 +1,69 @@
+#ifndef MGONGPUNVTX_H
+#define MGONGPUNVTX_H 1
+
+// Provides macros for simply use of NVTX, if a compiler macro USE_NVTX is defined.
+// Original author Peter Heywood <p.heywood@sheffield.ac.uk>
+// With a few modifications by Andrea Valassi
+
+//-------------------------------------------
+// NVTX is enabled
+//-------------------------------------------
+
+#ifdef USE_NVTX
+
+#include <stdint.h>
+
+// This assumes CUDA 10.0+
+#include "nvtx3/nvToolsExt.h"
+
+// Scope some things into a namespace
+namespace nvtx
+{
+
+  // Colour palette (RGB): https://colorbrewer2.org/#type=qualitative&scheme=Paired&n=12
+  const uint32_t palette[] = { 0xffa6cee3, 0xff1f78b4, 0xffb2df8a, 0xff33a02c, 0xfffb9a99, 0xffe31a1c, 0xfffdbf6f, 0xffff7f00, 0xffcab2d6, 0xff6a3d9a, 0xffffff99, 0xffb15928 };
+  const uint32_t colourCount = sizeof( palette ) / sizeof( uint32_t );
+
+  // Inline method to push an nvtx range
+  inline void push( const char* str, const uint32_t nextColourIdx )
+  {
+    // Get the wrapped colour index
+    uint32_t colourIdx = nextColourIdx % colourCount;
+    // Build/populate the struct of nvtx event attributes
+    nvtxEventAttributes_t eventAttrib = { 0 }; // zero-out the struct (see https://nvidia.github.io/NVTX/doxygen/structnvtx_event_attributes__v2.html)
+    eventAttrib.version = NVTX_VERSION;
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+    eventAttrib.colorType = NVTX_COLOR_ARGB;
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
+    eventAttrib.color = palette[colourIdx];
+    eventAttrib.message.ascii = str;
+    // Push the custom event.
+    nvtxRangePushEx( &eventAttrib );
+  }
+
+  // Inline method to pop an nvtx range
+  inline void pop()
+  {
+    nvtxRangePop();
+  }
+
+}
+
+// Macro to push an arbitrary nvtx marker
+#define NVTX_PUSH( str, idx ) nvtx::push( str, idx )
+
+// Macro to pop an arbitrary nvtx marker
+#define NVTX_POP() nvtx::pop()
+
+//-------------------------------------------
+// NVTX is not enabled
+//-------------------------------------------
+
+#else
+
+#define NVTX_PUSH( str, idx )
+#define NVTX_POP()
+
+#endif
+
+#endif // MGONGPUNVTX_H 1
diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h
new file mode 100644
index 0000000000..9f8dbbb7f9
--- /dev/null
+++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h
@@ -0,0 +1,58 @@
+#ifndef OMPNUMTHREADS_H
+#define OMPNUMTHREADS_H 1
+
+#ifdef _OPENMP
+
+#include <omp.h>
+
+#include <iostream>
+
+// The OMP_NUM_THREADS environment variable is used to control OMP multi-threading
+// By default, all available $(nproc) threads are used if OMP_NUM_THREADS is not set:
+// if ompnumthreadsNotSetMeansOneThread is called, only one thread is used instead
+inline void
+ompnumthreadsNotSetMeansOneThread( int debuglevel ) // quiet(-1), info(0), debug(1)
+{
+  // Set OMP_NUM_THREADS equal to 1 if it is not yet set
+  char* ompnthr = getenv( "OMP_NUM_THREADS" );
+  if( debuglevel == 1 )
+  {
+    std::cout << "DEBUG: entering ompnumthreadsNotSetMeansOneThread" << std::endl;
+    std::cout << "DEBUG: omp_get_num_threads() = "
+              << omp_get_num_threads() << std::endl; // always == 1 here!
+    std::cout << "DEBUG: omp_get_max_threads() = "
+              << omp_get_max_threads() << std::endl;
+    std::cout << "DEBUG: ${OMP_NUM_THREADS} = '"
              << ( ompnthr == 0 ? 
"[not set]" : ompnthr ) << "'" << std::endl; + } + if( ompnthr == NULL || + std::string( ompnthr ).find_first_not_of( "0123456789" ) != std::string::npos || + atol( ompnthr ) == 0 ) + { + if( ompnthr != NULL ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "WARNING! OMP_NUM_THREADS is invalid: will use only 1 thread" << std::endl; + else if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "DEBUG: OMP_NUM_THREADS is not set: will use only 1 thread" << std::endl; + omp_set_num_threads( 1 ); // https://stackoverflow.com/a/22816325 + if( debuglevel == 1 ) + { + std::cout << "DEBUG: omp_get_num_threads() = " + << omp_get_num_threads() << std::endl; // always == 1 here! + std::cout << "DEBUG: omp_get_max_threads() = " + << omp_get_max_threads() << std::endl; + } + } + else if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "DEBUG: OMP_NUM_THREADS = " << ompnthr << std::endl; + if( debuglevel >= 0 ) + std::cout << "(ompnumthreadsNotSetMeansOneThread) " + << "omp_get_max_threads() = " << omp_get_max_threads() << std::endl; + if( debuglevel == 1 ) + std::cout << "DEBUG: exiting ompnumthreadsNotSetMeansOneThread" << std::endl; +} +#endif + +#endif // OMPNUMTHREADS_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py new file mode 100644 index 0000000000..63f4c714a7 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 + +from optparse import OptionParser +from datetime import datetime +from mpl_toolkits.mplot3d import Axes3D # noqa: F401 +import matplotlib.pyplot as plt +from matplotlib import cm +from matplotlib.ticker import ScalarFormatter +import numpy as np +import copy +import sys +import json +from operator import itemgetter + + +class Perf(): + + def __init__(self, date, run, x, y, z, xrem, yrem, loc): + perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run) + data = 
open(perffile, 'r') + readJson = json.loads(data.read()) + data.close() + self.axesn = [x, y, z] + self.axesr = [xrem, yrem] # remove outer bands from axes + self.axesv = [[], [], []] + self.data = self.prepData(readJson) + + def prepData(self, jsonData): + for data in jsonData: + for i in data: + if isinstance(data[i], type('test')): + idx = -1 + if data[i].find("sec") != -1: + idx = data[i].find("sec") + elif data[i].find("GEV") != -1: + idx = data[i].find("GeV") + + if idx != -1: + data[i] = float(data[i][:idx - 1]) + return jsonData + + def prepAxes3D(self): + for d in self.data: + ks = list(d.keys()) + for ax in self.axesn: + idx = self.axesn.index(ax) + axlist = self.axesv[idx] + if ax in ks: + axval = d[ax] + if axval not in axlist: + axlist.append(axval) + else: + print('Error: cannot find axes name %s in %s' % (ax, d)) + if len(self.axesv[0]) * len(self.axesv[1]) != len(self.axesv[2]): + print('Error: axes don\'t match x * y != z (%d * %d != %d' % + (len(self.axesv[0]), len(self.axesv[1]), len(self.axesv[2]))) + self.axesv[0].sort() + self.axesv[1].sort() + self.axesv[0] = self.axesv[0][self.axesr[0]:] # sr + self.axesv[1] = self.axesv[1][self.axesr[1]:] # sr + + def prepData3D(self): + xlen = len(self.axesv[0]) + ylen = len(self.axesv[1]) + self.data2d = [] + ylist = [0] * ylen + for i in range(xlen): + self.data2d.append(copy.deepcopy(ylist)) + for d in self.data: + xpos = -1 + ypos = -1 + if d[self.axesn[0]] in self.axesv[0]: + xpos = self.axesv[0].index(d[self.axesn[0]]) + if d[self.axesn[1]] in self.axesv[1]: + ypos = self.axesv[1].index(d[self.axesn[1]]) + if xpos != -1 and ypos != -1: + zval = d[self.axesn[2]] + self.data2d[xpos][ypos] = zval + + def plot3D(self): + self.prepAxes3D() + self.prepData3D() + + data_array = np.array(self.data2d) + fig = plt.figure() + ax = fig.add_subplot(111, projection='3d') + x_data, y_data = np.meshgrid(np.arange(data_array.shape[1]), + np.arange(data_array.shape[0])) + xticks = x_data[0] + yticks = 
np.array(list(range(len(y_data)))) + x_data = x_data.flatten() + y_data = y_data.flatten() + z_data = data_array.flatten() + ax.set_xlabel(self.axesn[1], {'fontsize': 'small'}) + ax.set_xticks(xticks) + # consider 'fontsize': 'small' for dict also yticklabels + ax.set_xticklabels(self.axesv[1], {'rotation': 45, 'fontsize': 'small'}) + ax.set_ylabel(self.axesn[0], {'fontsize': 'small'}) + ax.set_yticks(yticks) + # consider 'fontsize': 'small' for dict + ax.set_yticklabels(self.axesv[0], {'rotation': 45, 'fontsize': 'small'}) + ax.set_zlabel(self.axesn[2], {'fontsize': 'small'}) + # ax.set_zscale('log') + # z_data = np.log10(z_data) + ax.bar3d(x_data, y_data, np.zeros(len(z_data)), 1, 1, z_data) + plt.show() + + def prepData2D(self): + self.dataDict2D = {} + xname = self.axesn[0] + yname = self.axesn[1] + zname = self.axesn[2] + + for d in self.data: + xval = d[xname] + yval = d[yname] + zval = d[zname] + dim = xval * yval + tick = '%s/%s' % (str(xval), str(yval)) + vallist = [float(str(zval).split()[0]), tick] + if dim not in self.dataDict2D: + self.dataDict2D[dim] = [vallist] + else: + self.dataDict2D[dim].append(vallist) + + def plot2D(self): + self.prepData2D() + + # use this value to plot a flat line for the cpu values to compare with + cpuval = 0 + # cpuval = 79766.84 # tot + # cpuval = 427251.1 # rmb + me + # cpuval = 472578.7 # me + + cmap = {'32': 'red', '64': 'orange', '128': 'blue', '256': 'green'} + smap = {'32': 20, '64': 40, '128': 80, '256': 160} + + dims = list(self.dataDict2D.keys()) + dims.sort() + xlist = list(range(1, len(dims) + 1)) + ylist = [] + clist = [] + slist = [] + ylabels = [] + for d in dims: + ysublist = [] + for y in self.dataDict2D[d]: + ysublist.append(y) # y[0] + ysublist = sorted(ysublist, key=itemgetter(0), reverse=True) + clist.append([cmap[x[1].split('/')[0]] for x in ysublist]) + slist.append([smap[x[1].split('/')[0]] for x in ysublist]) + # Temporary conversion for total time for events -> events per sec + # ysublist[0][0] = 
d / ysublist[0][0] + ylabels.append([x[1] for x in ysublist]) + ylist.append([x[0] for x in ysublist]) + + fig, ax = plt.subplots() + print(xlist) + print(ylist) + for xe, ye, ce, se in zip(xlist, ylist, clist, slist): + print([xe] * len(ye)) + ax.scatter([xe] * len(ye), ye, s=se, facecolors='none', + edgecolors=ce) + if cpuval: + ax.scatter(xe, cpuval, marker='+', c='dimgrey') + + ax.set_xticks(xlist) + ax.set_xlabel('%s * %s' % (self.axesn[0], self.axesn[1])) + ax.set_ylabel('%s' % (self.axesn[2])) + ax.set_yscale('log') + ax.set_xticklabels(dims, rotation=45) + ax.yaxis.set_major_formatter(ScalarFormatter()) + plt.ticklabel_format(axis="y", style="sci", scilimits=(0, 0)) + # Commenting only for the current example due to an overlap of the + # product labels + # xpos = 1 + # for y in ylabels: + # xstr = '' + # for x in y: + # # xstr += x.replace('/', '\n') + # xstr += x + # xstr += '\n' + # ax.text(xpos, 1, xstr, {'fontsize': 'xx-small', + # 'ha': 'center', + # 'va': 'bottom'}) + # xpos += 1 + + handlelist = [] + for k in cmap: + handlelist.append(plt.scatter([], [], s=smap[k], marker='o', + color=cmap[k], facecolor='none')) + + print(handlelist) + plt.legend(handlelist, [str(x) for x in cmap.keys()], + title="# threads / block") + + plt.show() + + def plotStack(self, threads=32): + collist = ['Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'Greys'] + # collist = ['tab20b', 'tab20c'] + + bars = {} + blocks = [] + for d in self.data: + if d['NumThreadsPerBlock'] == threads: + blocks.append(d['NumBlocksPerGrid']) + for k in d: + if k[0].isdigit(): + if k not in bars: + bars[k] = [] + + barks = list(bars.keys()) + barks.sort() + blocks.sort() + + for d in self.data: + if d['NumThreadsPerBlock'] == threads: + for b in barks: + if b in d: + bars[b].append(d[b]) + else: + bars[b].append(0) + + ind = np.arange(len(bars[barks[0]])) + width = 0.35 + + plts = [] + ci = -1 + cj = 0.5 + plts.append(plt.bar(ind, bars[barks[0]], width, edgecolor='black', + color='white')) + 
bot = [0] * len(bars[barks[0]]) + for i in range(1, len(barks)): + colcod = barks[i][:2] + if colcod[1] == 'a': + ci += 1 + cj = 0.5 + else: + cj += 0.1 + print(colcod, ci, cj, bot[-1], barks[i]) + col = cm.get_cmap(collist[ci])(cj) + sumlist = [] + for (l1, l2) in zip(bot, bars[barks[i - 1]]): + sumlist.append(l1 + l2) + bot = sumlist + plts.append(plt.bar(ind, bars[barks[i]], width, + bottom=bot, color=col, edgecolor=col)) + + plt.ylabel('seconds') + plts.reverse() + barks.reverse() + plt.xticks(ind, [str(x) for x in blocks], rotation=45) + plt.legend([x[0] for x in plts], barks) + + plt.show() + + +# import numpy as np +# import matplotlib.pyplot as plt +# +# N = 5 +# menMeans = (20, 35, 30, 35, 27) +# womenMeans = (25, 32, 34, 20, 25) +# menStd = (2, 3, 4, 1, 2) +# womenStd = (3, 5, 2, 3, 3) +# ind = np.arange(N) # the x locations for the groups +# width = 0.35 # the width of the bars: can also be len(x) sequence +# +# p1 = plt.bar(ind, menMeans, width, yerr=menStd) +# p2 = plt.bar(ind, womenMeans, width, +# bottom=menMeans, yerr=womenStd) +# +# plt.ylabel('Scores') +# plt.title('Scores by group and gender') +# plt.xticks(ind, ('G1', 'G2', 'G3', 'G4', 'G5')) +# plt.yticks(np.arange(0, 81, 10)) +# plt.legend((p1[0], p2[0]), ('Men', 'Women')) +# +# plt.show() + +def print_keys(loc, date, run): + perffile = '%s/%s-perf-test-run%s.json' % (loc, date, run) + data = open(perffile, 'r') + readJson = json.loads(data.read()) + data.close() + for k in list(readJson[0].keys()): + print(k) + + +if __name__ == '__main__': + + n = datetime.now() + today = str(n.year) + str(n.month).rjust(2, '0') + str(n.day).rjust(2, '0') + parser = OptionParser() + parser.add_option('-l', '--location', dest='dir', default='data', + help='directory with data (default: data)') + parser.add_option('-d', '--date', dest='date', default=today, + help='date of data files YYYYMMDD (default: today)') + parser.add_option('-r', '--run', default='1', dest='run', + help='run number (default: 1)') + 
parser.add_option('-x', dest='xax', default='NumThreadsPerBlock', + help='variable name for x axis \ + (default: NumThreadsPerBlock)') + parser.add_option('-y', dest='yax', default='NumBlocksPerGrid', + help='variable name for y axis \ + (default: NumBlocksPerGrid)') + parser.add_option('-z', dest='zax', default='TotalTimeInWaveFuncs', + help='variable name for z axis \ + (default: TotalTimeInWaveFuncs)') + parser.add_option('--xrm', dest='xrm', default=0, + help='# of outer x dimensions to remove') + parser.add_option('--yrm', dest='yrm', default=0, + help='# of outer y dimensions to remove') + parser.add_option('-k', '--keys', dest='keys', action='store_true', + help='print available keys from data') + + (op, ar) = parser.parse_args() + + plotnames = ['2D', '3D', 'STACK'] + plot = '2D' + + xrm = 0 + yrm = 0 + if op.xrm: + xrm = int(op.xrm) + if op.yrm: + yrm = int(op.yrm) + + if op.keys: + print_keys(op.dir, op.date, op.run) + sys.exit(0) + + if (len(ar) == 1 and ar[0].upper() not in plotnames) or len(ar) > 1: + print(parser.print_help()) + sys.exit(1) + elif len(ar) == 1: + plot = ar[0].upper() + + p = Perf(op.date, op.run, op.xax, op.yax, op.zax, xrm, yrm, op.dir) + if plot == '3D': + p.plot3D() + if plot == '2D': + p.plot2D() + if plot == 'STACK': + p.plotStack() diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh new file mode 100755 index 0000000000..1d60fa3542 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +usage(){ + echo "Usage (GUI analysis): $0 -l label [-cc] [-p #blocks #threads #iterations]" + echo "Usage (CL analysis): $0 -nogui [-p #blocks #threads #iterations]" + exit 1 +} + +# Default options +tag=cu +###cuargs="16384 32 12" # NEW DEFAULT 2020.08.10 (faster on local, and allows comparison to global and shared memory) +###ccargs=" 256 32 12" # Similar to cuda config, but faster than using "16384 32 12" +##cuargs="16384 32 
2" # faster tests +##ccargs=" 256 32 2" # faster tests +cuargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +ccargs="2048 256 1" # NEW DEFAULT 2021.04.06 (matches "-p 2048 256 12" but only one iteration) +args= +label= + +# Command line arguments +while [ "$1" != "" ]; do + # Profile C++ instead of cuda + if [ "$1" == "-cc" ]; then + if [ "$tag" != "nogui" ]; then + tag=cc + shift + else + echo "ERROR! Incompatible options -gui and -cc" + usage + fi + # Fast no-GUI profiling with ncu + elif [ "$1" == "-nogui" ]; then + if [ "$tag" != "cc" ]; then + tag=nogui + shift + else + echo "ERROR! Incompatible options -gui and -cc" + usage + fi + # Override blocks/threads/iterations + # (NB do not exceed 12 iterations: profiling overhead per iteration is huge) + elif [ "$1" == "-p" ]; then + if [ "$4" != "" ]; then + args="$2 $3 $4" + shift 4 + else + usage + fi + # Label + elif [ "$1" == "-l" ]; then + if [ "$2" != "" ]; then + label="$2" + shift 2 + else + usage + fi + # Invalid arguments + else + usage + fi +done + +if [ "$tag" == "cc" ]; then + if [ "$args" == "" ]; then args=$ccargs; fi + cmd="./check.exe -p $args" + make +else + if [ "$args" == "" ]; then args=$cuargs; fi + cmd="./gcheck.exe -p $args" + make +fi + +ncu="ncu" +nsys="nsys" +ncugui="ncu-ui &" +nsysgui="nsight-sys &" + +# Settings specific to CERN condor/batch nodes +###host=$(hostname) +###if [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then +### ncu=/usr/local/cuda-11.0/bin/ncu +### ###nsys=/usr/local/cuda-10.1/bin/nsys +### ###nsys=/usr/local/cuda-10.2/bin/nsys +### nsys=/cvmfs/sft.cern.ch/lcg/releases/cuda/11.0RC-d9c38/x86_64-centos7-gcc62-opt/bin/nsys +### ncugui="Launch the Nsight Compute GUI from Windows" +### nsysgui="Launch the Nsight System GUI from Windows" +###fi + +# Settings specific to CERN IT/SC nodes +# (nsys 11.4 and 11.5 fail with 'boost::wrapexcept') +host=$(hostname) +if [ "${host%%cern.ch}" != "${host}" ] && [ 
"${host##itsc}" != "${host}" ]; then + CUDA_NSIGHT_HOME=/usr/local/cuda-11.1 + echo "Using Nsight from ${CUDA_NSIGHT_HOME}" + ncu=${CUDA_NSIGHT_HOME}/bin/ncu + nsys=${CUDA_NSIGHT_HOME}/bin/nsys + ncugui="${CUDA_NSIGHT_HOME}/bin/ncu-ui &" + nsysgui="${CUDA_NSIGHT_HOME}/bin/nsight-sys &" +fi + +# Set the ncu sampling period (default is auto) +# The value is in the range [0..31], the actual period is 2**(5+value) cycles. +###ncu="${ncu} --sampling-interval 0" # MAX sampling frequency +###ncu="${ncu} --sampling-interval 31" # MIN sampling frequency + +# METRICS FOR COALESCED MEMORY ACCESS (AOSOA etc) +# See https://developer.nvidia.com/blog/using-nsight-compute-to-inspect-your-kernels/ +# These used to be called gld_transactions and global_load_requests +# See also https://docs.nvidia.com/nsight-compute/2019.5/NsightComputeCli/index.html#nvprof-metric-comparison +# See also https://stackoverflow.com/questions/60535867 +metrics=l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_requests_pipe_lsu_mem_global_op_ld.sum + +# METRICS FOR REGISTER PRESSURE +metrics+=,launch__registers_per_thread + +# METRICS FOR DIVERGENCE +metrics+=,sm__sass_average_branch_targets_threads_uniform.pct + +# GUI analysis +if [ "$tag" != "nogui" ]; then + + if [ "$label" == "" ]; then + echo "ERROR! You must specify a label" + usage + fi + + arg1=$(echo $args | cut -d' ' -f1) + arg2=$(echo $args | cut -d' ' -f2) + arg3=$(echo $args | cut -d' ' -f3) + + ###if [ "${host%%raplab*}" != "${host}" ]; then + ### logs=nsight_logs_raplab + ###elif [ "${host%%cern.ch}" != "${host}" ] && [ "${host##b}" != "${host}" ]; then + ### logs=nsight_logs_lxbatch + ###else + ### logs=nsight_logs + ###fi + logs=nsight_logs + + if [ ! 
-d $logs ]; then mkdir -p $logs; fi + trace=$logs/Sigma_sm_gg_ttxgg_${tag}_`date +%m%d_%H%M`_b${arg1}_t${arg2}_i${arg3} + if [ "$label" != "" ]; then trace=${trace}_${label}; fi + + echo + echo "PROFILING: ${cmd}" + echo "OUTPUT: ${trace}.*" + echo + + \rm -f ${trace}.* + + hostname > ${trace}.txt + echo "nproc=$(nproc)" >> ${trace}.txt + echo >> ${trace}.txt + ( time ${cmd} ) 2>&1 | tee -a ${trace}.txt + nvidia-smi -q -d CLOCK >> ${trace}.txt + + if [ "$tag" == "cu" ]; then + echo + echo "${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd}" + echo + ${ncu} --set full --metrics ${metrics} -o ${trace} ${cmd} + fi + echo + echo "${nsys} profile -o ${trace} ${cmd}" + echo + ${nsys} profile -o ${trace} ${cmd} + echo "" + echo "TO ANALYSE TRACE FILES:" + echo " ${ncugui}" + echo " ${nsysgui}" + +# NO-GUI analysis +else + + echo + echo "PROFILING: ${cmd}" + echo "${ncu} --metrics ${metrics} ${cmd}" + echo + echo sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + sudo LD_LIBRARY_PATH=${LD_LIBRARY_PATH} $(which ${ncu}) --metrics ${metrics} --target-processes all ${cmd} + +fi diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc new file mode 100644 index 0000000000..a1cec39ced --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc @@ -0,0 +1,251 @@ +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "MadgraphTest.h" +#include "MatrixElementKernels.h" +#include "MemoryAccessMatrixElements.h" +#include "MemoryAccessMomenta.h" +#include "MemoryBuffers.h" +#include "RamboSamplingKernels.h" +#include "RandomNumberKernels.h" +#include "epoch_process_id.h" + +#ifdef __CUDACC__ +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + +struct CUDA_CPU_TestBase : public TestDriverBase +{ + static constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + static constexpr int np4 = mgOnGpu::np4; + 
static constexpr int npar = mgOnGpu::npar; + static_assert( gputhreads % neppM == 0, "ERROR! #threads/block should be a multiple of neppM" ); + static_assert( gputhreads <= mgOnGpu::ntpbMAX, "ERROR! #threads/block should be <= ntpbMAX" ); + CUDA_CPU_TestBase( const std::string& refFileName ) + : TestDriverBase( npar, refFileName ) {} +}; + +#ifndef __CUDACC__ +struct CPUTest : public CUDA_CPU_TestBase +{ + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + HostBufferRndNumMomenta hstRndmom; + HostBufferMomenta hstMomenta; + HostBufferGs hstGs; + HostBufferRndNumHelicity hstRndHel; + HostBufferRndNumColor hstRndCol; + HostBufferWeights hstWeights; + HostBufferMatrixElements hstMatrixElements; + HostBufferSelectedHelicity hstSelHel; + HostBufferSelectedColor hstSelCol; + HostBufferHelicityMask hstIsGoodHel; + + // Create a process object + // Read param_card and set parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! 
+ CPUTest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndmom( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( mgOnGpu::ncomb ) + { + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CPUTest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + CommonRandomNumberKernel rnk( hstRndmom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelHost rsk( energy, hstRndmom, hstMomenta, hstWeights, nevt ); + // --- 2a. Fill in momenta of initial state particles on the device + rsk.getMomentaInitial(); + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + rsk.getMomentaFinal(); + } + + void runSigmaKin( std::size_t iiter ) override + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG; + MatrixElementKernelHost mek( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ); + if( iiter == 0 ) mek.computeGoodHelicities(); + constexpr unsigned int channelId = 0; // TEMPORARY? 
disable multi-channel in runTest.exe #466 + mek.computeMatrixElements( channelId ); + } + + fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override + { + assert( ipar < npar ); + assert( ip4 < np4 ); + return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar ); + } + + fptype getMatrixElement( std::size_t ievt ) const override + { + return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); + } +}; +#endif + +#ifdef __CUDACC__ +struct CUDATest : public CUDA_CPU_TestBase +{ + // Reset the device when our test goes out of scope. Note that this should happen after + // the frees, i.e. be declared before the pointers to device memory. + struct DeviceReset + { + ~DeviceReset() + { + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + } + } deviceResetter; + + // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) + // [NB the hst/dev memory arrays must be initialised in the constructor, see issue #290] + CPPProcess process; + PinnedHostBufferRndNumMomenta hstRndmom; + PinnedHostBufferMomenta hstMomenta; + PinnedHostBufferGs hstGs; + PinnedHostBufferRndNumHelicity hstRndHel; + PinnedHostBufferRndNumColor hstRndCol; + PinnedHostBufferWeights hstWeights; + PinnedHostBufferMatrixElements hstMatrixElements; + PinnedHostBufferSelectedHelicity hstSelHel; + PinnedHostBufferSelectedColor hstSelCol; + PinnedHostBufferHelicityMask hstIsGoodHel; + DeviceBufferRndNumMomenta devRndmom; + DeviceBufferMomenta devMomenta; + DeviceBufferGs devGs; + DeviceBufferRndNumHelicity devRndHel; + DeviceBufferRndNumColor devRndCol; + DeviceBufferWeights devWeights; + DeviceBufferMatrixElements devMatrixElements; + DeviceBufferSelectedHelicity devSelHel; + DeviceBufferSelectedColor devSelCol; + DeviceBufferHelicityMask devIsGoodHel; + + // Create a process object + // Read param_card and set 
parameters + // ** WARNING EVIL EVIL ** + // The CPPProcess constructor has side effects on the globals Proc::cHel, which is needed in ME calculations. + // Don't remove! + CUDATest( const std::string& refFileName ) + : CUDA_CPU_TestBase( refFileName ) + , process( /*verbose=*/false ) + , hstRndmom( nevt ) + , hstMomenta( nevt ) + , hstGs( nevt ) + , hstRndHel( nevt ) + , hstRndCol( nevt ) + , hstWeights( nevt ) + , hstMatrixElements( nevt ) + , hstSelHel( nevt ) + , hstSelCol( nevt ) + , hstIsGoodHel( mgOnGpu::ncomb ) + , devRndmom( nevt ) + , devMomenta( nevt ) + , devGs( nevt ) + , devRndHel( nevt ) + , devRndCol( nevt ) + , devWeights( nevt ) + , devMatrixElements( nevt ) + , devSelHel( nevt ) + , devSelCol( nevt ) + , devIsGoodHel( mgOnGpu::ncomb ) + { + process.initProc( "../../Cards/param_card.dat" ); + } + + virtual ~CUDATest() {} + + void prepareRandomNumbers( unsigned int iiter ) override + { + CommonRandomNumberKernel rnk( hstRndmom ); + rnk.seedGenerator( 1337 + iiter ); + rnk.generateRnarray(); + copyDeviceFromHost( devRndmom, hstRndmom ); + } + + void prepareMomenta( fptype energy ) override + { + RamboSamplingKernelDevice rsk( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ); + // --- 2a. Fill in momenta of initial state particles on the device + rsk.getMomentaInitial(); + // --- 2b. Fill in momenta of final state particles using the RAMBO algorithm on the device + // (i.e. map random numbers to final-state particle momenta for each of nevt events) + rsk.getMomentaFinal(); + // --- 2c. CopyDToH Weights + copyHostFromDevice( hstWeights, devWeights ); + // --- 2d. 
CopyDToH Momenta + copyHostFromDevice( hstMomenta, devMomenta ); + } + + void runSigmaKin( std::size_t iiter ) override + { + constexpr fptype fixedG = 1.2177157847767195; // fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) + for( unsigned int i = 0; i < nevt; ++i ) hstGs[i] = fixedG; + copyDeviceFromHost( devGs, hstGs ); // BUG FIX #566 + MatrixElementKernelDevice mek( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ); + if( iiter == 0 ) mek.computeGoodHelicities(); + constexpr unsigned int channelId = 0; // TEMPORARY? disable multi-channel in runTest.exe #466 + mek.computeMatrixElements( channelId ); + copyHostFromDevice( hstMatrixElements, devMatrixElements ); + } + + fptype getMomentum( std::size_t ievt, unsigned int ipar, unsigned int ip4 ) const override + { + assert( ipar < npar ); + assert( ip4 < np4 ); + return MemoryAccessMomenta::ieventAccessIp4IparConst( hstMomenta.data(), ievt, ip4, ipar ); + } + + fptype getMatrixElement( std::size_t ievt ) const override + { + return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); + } +}; +#endif + +// Use two levels of macros to force stringification at the right level +// (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) +// Google macro is in https://github.com/google/googletest/blob/master/googletest/include/gtest/gtest-param-test.h +#define TESTID_CPU( s ) s##_CPU +#define XTESTID_CPU( s ) TESTID_CPU( s ) +#define MG_INSTANTIATE_TEST_SUITE_CPU( prefix, test_suite_name ) \ +INSTANTIATE_TEST_SUITE_P( prefix, \ + test_suite_name, \ + testing::Values( new CPUTest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); +#define TESTID_GPU( s ) s##_GPU +#define XTESTID_GPU( s ) TESTID_GPU( s ) +#define MG_INSTANTIATE_TEST_SUITE_GPU( prefix, test_suite_name ) \ +INSTANTIATE_TEST_SUITE_P( prefix, \ + test_suite_name, \ + testing::Values( new CUDATest( 
MG_EPOCH_REFERENCE_FILE_NAME ) ) ); + +#ifdef __CUDACC__ +MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); +#else +MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); +#endif diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc new file mode 100644 index 0000000000..5fa8ac70fe --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -0,0 +1,217 @@ +// Use ./runTest.exe --gtest_filter=*misc to run only this test + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "epoch_process_id.h" + +#include + +#include +#include + +#ifdef __CUDACC__ +#define TESTID( s ) s##_GPU_MISC +#else +#define TESTID( s ) s##_CPU_MISC +#endif + +#define XTESTID( s ) TESTID( s ) + +#ifdef MGONGPU_CPPSIMD /* clang-format off */ +bool maskand( const bool_v& mask ){ bool out = true; for ( int i=0; i=1] + EXPECT_TRUE( ( f[i] == 0 ) ); // equals 0, not 1 + } +#endif + } + +#ifdef MGONGPU_CPPSIMD + // Vector initialization for cxtype_sv - demonstrate fix for bug #339 + { + fptype_sv f1 = fptype_v{ 0 } + 1; + EXPECT_TRUE_sv( f1 == 1 ); + cxtype_v c12 = cxmake( f1, 2 ); + //std::cout << c12 << std::endl << boolTF( c12.real() == 1 ) << std::endl << boolTF( c12.imag() == 2 ) << std::endl; + EXPECT_TRUE_sv( c12.real() == 1 ); + EXPECT_TRUE_sv( c12.imag() == 2 ); + cxtype_v c21 = cxmake( 2, f1 ); + //std::cout << c21 << std::endl << boolTF( c21.real() == 2 ) << std::endl << boolTF( c21.imag() == 1 ) << std::endl; + EXPECT_TRUE_sv( c21.real() == 2 ); + EXPECT_TRUE_sv( c21.imag() == 1 ); + } +#endif + + // Vector initialization for cxtype_sv + { + cxtype_sv c = cxzero_sv(); + EXPECT_TRUE_sv( c.real() == 0 ); + EXPECT_TRUE_sv( c.imag() == 0 ); + } + { + cxtype_sv c = cxmake( 1, fptype_sv{ 0 } ); // here was a bug #339 + EXPECT_TRUE_sv( c.real() == 1 ); + EXPECT_TRUE_sv( c.imag() == 0 ); + } + { + cxtype_sv c = cxmake( fptype_sv{ 0 
}, 1 ); // here was a bug #339 + EXPECT_TRUE_sv( c.real() == 0 ); + EXPECT_TRUE_sv( c.imag() == 1 ); + } + + // Array initialization for cxtype_sv array (example: jamp_sv in CPPProcess.cc) + { + cxtype_sv array[2] = {}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxype is NOT, if "= {}" is missing!) + //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + // Alternative array initialization for cxtype_sv array (example: was used for outwf in testxxx.cc) + { + cxtype_sv array[2]{}; // all zeros (NB: vector cxtype_v IS initialized to 0, but scalar cxype is NOT, if "{}" is missing!) + //std::cout << array[0].real() << std::endl; std::cout << boolTF( array[0].real() == 0 ) << std::endl; + EXPECT_TRUE_sv( array[0].real() == 0 ); + EXPECT_TRUE_sv( array[0].imag() == 0 ); + EXPECT_TRUE_sv( array[1].real() == 0 ); + EXPECT_TRUE_sv( array[1].imag() == 0 ); + } + + //-------------------------------------------------------------------------- + + // Scalar complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype f1 = 1; + fptype f2 = 2; + cxtype_ref r12( f1, f2 ); // copy refs + //cxtype_ref r12a( r12 ); //deleted + cxtype_ref r12a( cxtype_ref( f1, f2 ) ); // copy refs + //cxtype_ref r12b = r12; // deleted + cxtype_ref r12b = cxtype_ref( f1, f2 ); // copy refs + EXPECT_TRUE( cxtype( r12 ).real() == 1 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12a ).real() == 1 ); + EXPECT_TRUE( cxtype( r12a ).imag() == 2 ); + EXPECT_TRUE( cxtype( r12b ).real() == 1 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype f1c = 0; + fptype f2c = 0; + cxtype_ref r12c( f1c, f2c ); + EXPECT_TRUE( cxtype( r12c ).real() == 0 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 0 ); + //r12c = r12; 
// deleted + r12c = cxtype( r12 ); // copy values + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); + // Update f1, f2 + f1 = 10; + f2 = 20; + EXPECT_TRUE( cxtype( r12 ).real() == 10 ); + EXPECT_TRUE( cxtype( r12 ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12a ).real() == 10 ); + EXPECT_TRUE( cxtype( r12a ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12b ).real() == 10 ); + EXPECT_TRUE( cxtype( r12b ).imag() == 20 ); + EXPECT_TRUE( cxtype( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE( cxtype( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + // Vector complex references + { + using namespace mgOnGpu; + // Refs to f1, f2 + fptype_sv f1 = fptype_sv{ 0 } + 1; + fptype_sv f2 = fptype_sv{ 0 } + 2; + cxtype_sv_ref r12( f1, f2 ); // copy refs + //cxtype_sv_ref r12a( r12 ); //deleted + cxtype_sv_ref r12a( cxtype_sv_ref( f1, f2 ) ); // copy refs + //cxtype_sv_ref r12b = r12; // deleted + cxtype_sv_ref r12b = cxtype_sv_ref( f1, f2 ); // copy refs + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 2 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 2 ); + // Refs to f1c, f2c + fptype_sv f1c = fptype_sv{ 0 }; + fptype_sv f2c = fptype_sv{ 0 }; + cxtype_sv_ref r12c( f1c, f2c ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 0 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 0 ); + //r12c = r12; // deleted + r12c = cxtype_sv( r12 ); // copy values + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); + // Update f1, f2 + f1 = fptype_sv{ 0 } + 10; + f2 = fptype_sv{ 0 } + 20; + EXPECT_TRUE_sv( cxtype_sv( r12 ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12 ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12a ).imag() == 20 ); + 
EXPECT_TRUE_sv( cxtype_sv( r12b ).real() == 10 ); + EXPECT_TRUE_sv( cxtype_sv( r12b ).imag() == 20 ); + EXPECT_TRUE_sv( cxtype_sv( r12c ).real() == 1 ); // points to f1c, not to f1 + EXPECT_TRUE_sv( cxtype_sv( r12c ).imag() == 2 ); // points to f2c, not to f2 + } + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc new file mode 100644 index 0000000000..1052022dd8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc @@ -0,0 +1,323 @@ +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" +#include "HelAmps_MSSM_SLHA2.h" +#include "MemoryAccessMomenta.h" +#include "MemoryAccessWavefunctions.h" +#include "MemoryBuffers.h" +#include "epoch_process_id.h" + +#include + +#include +#include +#include +#include +#include +#include +#ifdef __CUDACC__ +#define TESTID( s ) s##_GPU_XXX +#else +#define TESTID( s ) s##_CPU_XXX +#endif + +#define XTESTID( s ) TESTID( s ) + +TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) +{ + constexpr bool dumpEvents = false; // dump the expected output of the test? + constexpr bool testEvents = !dumpEvents; // run the test? + constexpr fptype toleranceXXXs = std::is_same::value ? 
1.E-15 : 1.E-5; + // Constant parameters + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout + using mgOnGpu::neppV; + using mgOnGpu::np4; + using mgOnGpu::npar; + const int nevt = 16; // 12 independent tests plus 4 duplicates (need a multiple of 8 for floats or for '512z') + assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM + assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV + // Fill in the input momenta +#ifdef __CUDACC__ + mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#else + mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] +#endif /* clang-format off */ + const fptype par0[np4 * nevt] = // AOS[nevt][np4] + { + 500, 0, 0, 500, // #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #4 (m=0 pT>0 pz<0) + 500, 0, 0, 0, // #5 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #6 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #7 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #8 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #9 (m=40>0 pT>0 pz=0) + 500, 180, 192, 144, // #10 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // #11 (m=40>0 pT>0 pz<0) + 500, 0, 0, 500, // DUPLICATE #12 == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // DUPLICATE #13 == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // DUPLICATE #14 == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400 // DUPLICATE #15 == #3 (m=0 pT>0 pz>0) + }; /* clang-format on */ + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + fptype mass0[nevt] = {}; + bool ispzgt0[nevt] = {}; + bool ispzlt0[nevt] = {}; + bool isptgt0[nevt] = {}; + for( int ievt = 0; ievt < nevt; ievt++ ) + { + const fptype p0 = par0[ievt * np4 + 0]; + const fptype p1 = par0[ievt * np4 + 1]; + const fptype p2 = par0[ievt * np4 + 2]; + const 
fptype p3 = par0[ievt * np4 + 3]; + mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + ispzgt0[ievt] = ( p3 > 0 ); + ispzlt0[ievt] = ( p3 < 0 ); + isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); + } + const int ipar0 = 0; // use only particle0 for this test + for( int ievt = 0; ievt < nevt; ievt++ ) + { + for( int ip4 = 0; ip4 < np4; ip4++ ) + { + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), ievt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + } + } + // Expected output wavefunctions + std::vector> expwfs; +#include "testxxx_cc_ref.txt" // expwfs.push_back( {...} ); + std::string dumpFileName = "testxxx_cc_ref.txt.new"; + // Compute the output wavefunctions + // Dump new reference file if requested + using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + int itest = 0; // index on the expected output vector + std::ofstream dumpFile; + if( dumpEvents ) dumpFile.open( dumpFileName, std::ios::trunc ); + auto dumpwf6 = [&]( std::ostream& out, const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + out << std::setprecision( 15 ) << std::scientific; + out << " expwfs.push_back( {"; + out << " // ---------" << std::endl; + for( int iw6 = 0; iw6 < nw6; iw6++ ) + { +#ifdef MGONGPU_CPPSIMD + const int ieppV = ievt % neppV; // #event in the current event vector in this iteration +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + out << std::setw( 26 ) << cxreal( wf[iw6][ieppV] ) << ", "; + out << std::setw( 22 ) << cximag( wf[iw6][ieppV] ); +#else + out << std::setw( 26 ) << wf[iw6].real()[ieppV] << ", "; + out << std::setw( 22 ) << wf[iw6].imag()[ieppV]; +#endif +#else + out << std::setw( 26 ) << wf[iw6].real(); + out << ", " << std::setw( 22 ) << wf[iw6].imag(); +#endif + if( iw6 < nw6 - 1 ) + out << ", "; + else + out << " } );"; + out << " // itest=" << itest << ": " << xxx << "#" << ievt; + out << " nsp=" << nsp << " mass=" << (int)mass << std::endl; + } + 
out << std::defaultfloat; + }; + auto testwf6 = [&]( const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) + { + if( dumpEvents ) dumpwf6( dumpFile, wf, xxx, ievt, nsp, mass ); + if( testEvents ) + { + std::array& expwf = expwfs[itest]; + //std::cout << "Testing " << std::setw(3) << itest << ": " << xxx << " #" << ievt << std::endl; + ////for ( int iw6 = 0; iw6( outwfI ); // proof of concept for using fptype* in the interface + fptype* fp_outwfO = reinterpret_cast( outwfO ); // proof of concept for using fptype* in the interface + fptype* fp_outwf = reinterpret_cast( outwf ); // proof of concept for using fptype* in the interface + fptype* fp_outwf3 = reinterpret_cast( outwf3 ); // proof of concept for using fptype* in the interface + const int nhel = 1; + for( auto nsp: { -1, +1 } ) // antifermion/fermion (or initial/final for scalar and vector) + { + for( int ievt = 0; ievt < nevt; ievt++ ) + { +#ifdef __CUDACC__ + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif + if( false ) + { + std::cout << std::endl; + for( int ip4 = 0; ip4 < np4; ip4++ ) std::cout << par0[ievt * np4 + ip4] << ", "; + std::cout << std::endl; + } + const int ipagV = ievt / neppV; // #event vector in this iteration + const fptype* ievt0Momenta = MemoryAccessMomenta::ieventAccessRecordConst( hstMomenta.data(), ipagV * neppV ); + // Test ixxxxx - NO ASSUMPTIONS + { + const fptype fmass = mass0[ievt]; + ixxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", ievt, nsp, fmass ); + ixxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfI, ipar0 ); + testwf6( outwfI, "ixxxxx", ievt, nsp, -fmass ); + } + // Test ipzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + ipzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ipzxxx", ievt ); + testwf6( outwf, "ipzxxx", ievt, nsp, 0 ); + } + // Test imzxxx - ASSUMPTIONS: (FMASS == 
0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + imzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "imzxxx", ievt ); + testwf6( outwf, "imzxxx", ievt, nsp, 0 ); + } + // Test ixzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + ixzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfI, "ixzxxx", ievt ); + testwf6( outwf, "ixzxxx", ievt, nsp, 0 ); + } + // Test vxxxxx - NO ASSUMPTIONS + { + const fptype vmass = mass0[ievt]; + vxxxxx( ievt0Momenta, vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, vmass ); + vxxxxx( ievt0Momenta, -vmass, nhel, nsp, fp_outwf, ipar0 ); + testwf6( outwf, "vxxxxx", ievt, nsp, -vmass ); + } + // Test sxxxxx - NO ASSUMPTIONS + { + const fptype smass = mass0[ievt]; + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass>0") + testwf6( outwf3, "sxxxxx", ievt, nsp, smass ); + sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass<0") + testwf6( outwf3, "sxxxxx", ievt, nsp, -smass ); + } + // Test oxxxxx - NO ASSUMPTIONS + { + const fptype fmass = mass0[ievt]; + oxxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, fmass ); + oxxxxx( ievt0Momenta, -fmass, nhel, nsp, fp_outwfO, ipar0 ); + testwf6( outwfO, "oxxxxx", ievt, nsp, -fmass ); + } + // Test opzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) + { + opzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "opzxxx", ievt ); + testwf6( outwf, "opzxxx", ievt, nsp, 0 ); + } + // Test omzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) + { + omzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); + testwf6two( outwf, outwfO, "omzxxx", ievt 
); + testwf6( outwf, "omzxxx", ievt, nsp, 0 ); + } + // Test oxzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) + if( mass0[ievt] == 0 && isptgt0[ievt] ) + { + oxzxxx( ievt0Momenta, nhel, nsp, reinterpret_cast( outwf ), ipar0 ); + testwf6two( outwf, outwfO, "oxzxxx", ievt ); + testwf6( outwf, "oxzxxx", ievt, nsp, 0 ); + } + } + } + if( dumpEvents ) + { + dumpFile.close(); + std::cout << "INFO: New reference data dumped to file '" << dumpFileName << "'" << std::endl; + } +} + +//========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt new file mode 100644 index 0000000000..8bc0384a68 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt @@ -0,0 +1,2044 @@ + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=0: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=1: ixxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
5.000000000000000e+02, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=2: ipzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=2: ipzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=3: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=3: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=4: vxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=4: vxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=5: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=5: 
sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=5: sxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=6: sxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=6: sxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=6: sxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=7: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=7: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=8: oxxxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=8: oxxxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: 
opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=9: opzxxx#0 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=9: opzxxx#0 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=10: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=10: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=11: ixxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=12: imzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=12: imzxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, 
// itest=13: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=13: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=13: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=14: vxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=14: vxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=15: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=15: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=15: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=16: sxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=16: sxxxxx#1 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=16: sxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=17: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=17: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=18: oxxxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=19: omzxxx#1 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // 
itest=20: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=20: ixxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=21: ixxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=21: ixxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=22: ixzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=22: ixzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=22: ixzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=23: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=23: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=24: vxxxxx#2 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=24: vxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=25: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=25: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=26: sxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=26: sxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 
1.788854381999832e+01, // itest=27: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=27: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=28: oxxxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=28: oxxxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=29: oxzxxx#2 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=29: oxzxxx#2 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=30: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=30: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 
6.000000000000000e+00, -8.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=31: ixxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=31: ixxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=32: ixzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=32: ixzxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=33: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=33: vxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=34: vxxxxx#3 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=34: vxxxxx#3 nsp=-1 mass=0 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=35: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=35: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=36: sxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=36: sxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=37: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=37: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=38: 
oxxxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=38: oxxxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=38: oxxxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=39: oxzxxx#3 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=39: oxzxxx#3 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=40: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=40: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=40: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=41: ixxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=41: ixxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=41: ixxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+02, 
2.400000000000000e+02, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=42: ixzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=42: ixzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=42: ixzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=43: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=43: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=44: vxxxxx#4 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=44: vxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=45: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 
} ); // itest=45: sxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=46: sxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=46: sxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=47: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=47: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=47: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=48: oxxxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=48: oxxxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=48: oxxxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=49: oxzxxx#4 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=49: oxzxxx#4 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=49: oxzxxx#4 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=50: ixxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=51: ixxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=52: vxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=52: vxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, 
// itest=53: vxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=53: vxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=53: vxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=54: sxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=55: sxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=56: oxxxxx#5 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=57: oxxxxx#5 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=58: ixxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=59: ixxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=60: vxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=60: vxxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=61: vxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=61: vxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=62: sxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=62: sxxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, 
// itest=63: sxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=63: sxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=64: oxxxxx#6 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=65: oxxxxx#6 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=66: ixxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=67: ixxxxx#7 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=67: ixxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=68: vxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=68: vxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=68: vxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=69: vxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=69: vxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=70: sxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=70: sxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=71: sxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=72: oxxxxx#7 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=73: oxxxxx#7 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 
1.200000000000000e+01, -1.600000000000000e+01, // itest=74: ixxxxx#8 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=74: ixxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=75: ixxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=76: vxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 + 7.071067811865476e-01, 
0.000000000000000e+00 } ); // itest=77: vxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=78: sxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=79: sxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=80: oxxxxx#8 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=80: oxxxxx#8 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + 5.999999999999999e+00, 
7.999999999999999e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=81: oxxxxx#8 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=81: oxxxxx#8 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=82: ixxxxx#9 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=82: ixxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -7.999999999999999e+00, 5.999999999999999e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=83: ixxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=84: 
vxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=85: vxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=86: sxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=87: sxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=88: oxxxxx#9 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=88: oxxxxx#9 nsp=-1 
mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=88: oxxxxx#9 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=88: oxxxxx#9 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=89: oxxxxx#9 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=89: oxxxxx#9 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=90: ixxxxx#10 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=90: ixxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=91: ixxxxx#10 nsp=-1 mass=-400 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=92: vxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=93: vxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=94: sxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 
mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=95: sxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=96: oxxxxx#10 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=96: oxxxxx#10 nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=96: oxxxxx#10 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=96: oxxxxx#10 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=97: oxxxxx#10 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=97: oxxxxx#10 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=98: ixxxxx#11 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=98: ixxxxx#11 nsp=-1 mass=400 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=99: ixxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=100: vxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=101: vxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=102: 
sxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=102: sxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=103: sxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=104: oxxxxx#11 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=104: oxxxxx#11 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=105: oxxxxx#11 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=105: 
oxxxxx#11 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=106: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=107: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=108: ipzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=109: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 
-7.071067811865476e-01, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=109: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=109: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=110: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=110: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=110: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=111: sxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=111: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=112: sxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=112: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -5.000000000000000e+02, // itest=113: oxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=113: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=114: oxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=114: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=115: opzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: 
ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=116: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=117: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=118: imzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=119: vxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=119: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=119: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=120: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=120: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=121: sxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=121: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=122: sxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=122: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=123: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=123: 
oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=123: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=124: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=125: omzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=126: ixxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=126: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 
1.341640786499874e+01, -1.788854381999832e+01, // itest=127: ixxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=127: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=128: ixzxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=128: ixzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=129: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=129: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=130: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // 
itest=130: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=131: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=132: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=133: oxxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=133: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=134: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=134: oxxxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=134: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=135: oxzxxx#14 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=135: oxzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=136: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=137: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 4.000000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=138: ixzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=139: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=140: vxxxxx#15 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=140: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=141: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=142: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=143: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=144: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=145: oxzxxx#15 
nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=145: oxzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=146: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=146: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=147: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=147: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=148: ipzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=149: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=149: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=150: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=150: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=151: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=152: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=153: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=154: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=155: opzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=156: 
ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=156: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=157: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=157: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=158: imzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=159: vxxxxx#1 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=159: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=160: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=160: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=161: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=162: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: 
oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=163: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=164: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=165: omzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=166: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=166: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=167: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=167: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=168: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=168: ixzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=169: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=169: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=170: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 
4.242640687119285e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=170: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=171: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=171: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=172: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=172: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=173: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=173: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=173: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=174: oxxxxx#2 nsp=1 mass=0 + 
2.236067977499790e+01, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=174: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=174: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=175: oxzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=175: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=175: oxzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=176: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=177: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { 
// --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=178: ixzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=179: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=179: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=180: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=181: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=182: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=183: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=184: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=185: oxzxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=185: oxzxxx#3 nsp=1 
mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=185: oxzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=186: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=187: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=188: ixzxxx#4 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=189: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=190: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=190: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=191: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=192: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=193: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=193: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=194: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=194: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=195: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=195: oxzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // 
itest=196: ixxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=196: ixxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=197: ixxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=198: vxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=198: vxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=199: vxxxxx#5 nsp=1 mass=-500 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=199: vxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=200: sxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=201: sxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=202: oxxxxx#5 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 2.236067977499790e+01, 
0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=203: oxxxxx#5 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=204: ixxxxx#6 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=204: ixxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=205: ixxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=205: ixxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=206: vxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=206: vxxxxx#6 
nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=207: vxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=207: vxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=208: sxxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=209: sxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=210: oxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 
0.000000000000000e+00, -0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=210: oxxxxx#6 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=211: oxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=211: oxxxxx#6 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=212: ixxxxx#7 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=212: ixxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=213: ixxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, -3.000000000000000e+02, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=214: vxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=214: vxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=215: vxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=215: vxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=216: sxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=217: sxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=218: oxxxxx#7 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=219: oxxxxx#7 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=220: ixxxxx#8 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=220: ixxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=221: 
ixxxxx#8 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=221: ixxxxx#8 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=221: ixxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=222: vxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=223: vxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=224: sxxxxx#8 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=224: sxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=225: sxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=226: oxxxxx#8 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=226: oxxxxx#8 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=226: oxxxxx#8 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=227: oxxxxx#8 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=227: oxxxxx#8 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + 2.400000000000000e+02, 
1.800000000000000e+02, // itest=228: ixxxxx#9 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=228: ixxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=229: ixxxxx#9 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=229: ixxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=230: vxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=231: vxxxxx#9 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=231: vxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=231: vxxxxx#9 
nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=231: vxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=232: sxxxxx#9 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=232: sxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=233: sxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=234: oxxxxx#9 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=234: oxxxxx#9 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=234: oxxxxx#9 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=235: oxxxxx#9 nsp=1 mass=-400 + 
2.000000000000000e+01, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=235: oxxxxx#9 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=235: oxxxxx#9 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=236: ixxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=237: ixxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=238: vxxxxx#10 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 + 6.203224967708328e-01, 
0.000000000000000e+00 } ); // itest=238: vxxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=239: vxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=240: sxxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=241: sxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 + 2.433105012119288e+01, 
0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=242: oxxxxx#10 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=242: oxxxxx#10 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=243: oxxxxx#10 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=243: oxxxxx#10 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=244: ixxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); 
// itest=245: ixxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=246: vxxxxx#11 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=246: vxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=247: vxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=248: sxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // 
itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=249: sxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=250: oxxxxx#11 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=250: oxxxxx#11 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=251: oxxxxx#11 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 + -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=251: oxxxxx#11 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=252: ixxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=252: ixxxxx#12 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=253: ixxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=253: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=254: ipzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=255: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=255: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // 
itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=256: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=256: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=257: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=258: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=259: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=260: oxxxxx#12 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=260: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=261: opzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=262: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=262: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=263: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 + 
-3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=263: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=264: imzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=265: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=265: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=266: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=266: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, 
// itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=267: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=268: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=269: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, -0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=270: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, -5.000000000000000e+02, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=271: omzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=272: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=272: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=273: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=273: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=274: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 
+ 2.236067977499790e+01, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=274: ixzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=275: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=275: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=276: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=276: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=277: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=277: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // 
itest=278: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=278: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=279: oxxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=279: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=279: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=280: oxxxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=280: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=280: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=281: oxzxxx#14 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=281: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=281: oxzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=282: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=283: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=284: ixzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: vxxxxx#15 nsp=1 mass=0 + 
-3.394112549695428e-01, -5.656854249492381e-01, // itest=285: vxxxxx#15 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=285: vxxxxx#15 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=285: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=286: vxxxxx#15 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=286: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=287: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=288: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // 
itest=289: oxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=289: oxxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=289: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=290: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=291: oxzxxx#15 nsp=1 mass=0 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h new file mode 100644 index 0000000000..14d7a4d892 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h @@ -0,0 +1,67 @@ +#ifndef MGONGPUTIMER_H +#define MGONGPUTIMER_H 1 + +#include +#include + +namespace mgOnGpu +{ + + /* + high_resolution_clock + steady_clock + system_clock + + from 
https://www.modernescpp.com/index.php/the-three-clocks + and https://codereview.stackexchange.com/questions/196245/extremely-simple-timer-class-in-c + */ + + template + class Timer + { + public: + Timer() + : m_StartTime( T::now() ) {} + virtual ~Timer() {} + void Start(); + float GetDuration(); + void Info(); + private: + typedef typename T::time_point TTP; + TTP m_StartTime; + }; + + template + void + Timer::Start() + { + m_StartTime = T::now(); + } + + template + float + Timer::GetDuration() + { + std::chrono::duration duration = T::now() - m_StartTime; + return duration.count(); + } + + template + void + Timer::Info() + { + typedef typename T::period TPER; + typedef typename std::ratio_multiply MilliSec; + typedef typename std::ratio_multiply MicroSec; + std::cout << std::boolalpha << std::endl; + std::cout << "clock info: " << std::endl; + std::cout << " is steady: " << T::is_steady << std::endl; + std::cout << " precision: " << TPER::num << "/" << TPER::den << " second " << std::endl; + std::cout << std::fixed; + std::cout << " " << static_cast( MilliSec::num ) / MilliSec::den << " milliseconds " << std::endl; + std::cout << " " << static_cast( MicroSec::num ) / MicroSec::den << " microseconds " << std::endl; + std::cout << std::endl; + } + +} +#endif // MGONGPUTIMER_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h new file mode 100644 index 0000000000..60d8c51021 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h @@ -0,0 +1,156 @@ +#ifndef MGONGPUTIMERMAP_H +#define MGONGPUTIMERMAP_H 1 + +#include +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "nvtx.h" +#pragma GCC diagnostic pop + +#include "timer.h" +#define TIMERTYPE std::chrono::high_resolution_clock + +namespace mgOnGpu +{ + class TimerMap + { + + public: + + TimerMap() + : m_timer(), m_active( "" ), 
m_partitionTimers(), m_partitionIds() {} + virtual ~TimerMap() {} + + // Start the timer for a specific partition (key must be a non-empty string) + // Stop the timer for the current partition if there is one active + float start( const std::string& key ) + { + assert( key != "" ); + // Close the previously active partition + float last = stop(); + // Switch to a new partition + m_timer.Start(); + m_active = key; + if( m_partitionTimers.find( key ) == m_partitionTimers.end() ) + { + m_partitionIds[key] = m_partitionTimers.size(); + m_partitionTimers[key] = 0; + } + // Open a new Cuda NVTX range + NVTX_PUSH( key.c_str(), m_partitionIds[key] ); + // Return last duration + return last; + } + + // Stop the timer for the current partition if there is one active + float stop() + { + // Close the previously active partition + float last = 0; + if( m_active != "" ) + { + last = m_timer.GetDuration(); + m_partitionTimers[m_active] += last; + } + m_active = ""; + // Close the current Cuda NVTX range + NVTX_POP(); + // Return last duration + return last; + } + + // Dump the overall results + void dump( std::ostream& ostr = std::cout, bool json = false ) + { + // Improve key formatting + const std::string totalKey = "TOTAL "; // "TOTAL (ANY)"? 
+ //const std::string totalBut2Key = "TOTAL (n-2)"; + const std::string total123Key = "TOTAL (123)"; + const std::string total23Key = "TOTAL (23)"; + const std::string total1Key = "TOTAL (1)"; + const std::string total2Key = "TOTAL (2)"; + const std::string total3Key = "TOTAL (3)"; + const std::string total3aKey = "TOTAL (3a)"; + size_t maxsize = 0; + for( auto ip: m_partitionTimers ) + maxsize = std::max( maxsize, ip.first.size() ); + maxsize = std::max( maxsize, totalKey.size() ); + // Compute the overall total + //size_t ipart = 0; + float total = 0; + //float totalBut2 = 0; + float total123 = 0; + float total23 = 0; + float total1 = 0; + float total2 = 0; + float total3 = 0; + float total3a = 0; + for( auto ip: m_partitionTimers ) + { + total += ip.second; + //if ( ipart != 0 && ipart+1 != m_partitionTimers.size() ) totalBut2 += ip.second; + if( ip.first[0] == '1' || ip.first[0] == '2' || ip.first[0] == '3' ) total123 += ip.second; + if( ip.first[0] == '2' || ip.first[0] == '3' ) total23 += ip.second; + if( ip.first[0] == '1' ) total1 += ip.second; + if( ip.first[0] == '2' ) total2 += ip.second; + if( ip.first[0] == '3' ) total3 += ip.second; + if( ip.first[0] == '3' && ip.first[1] == 'a' ) total3a += ip.second; + //ipart++; + } + // Dump individual partition timers and the overall total + if( json ) + { + std::string s1 = "\"", s2 = "\" : \"", s3 = " sec\","; + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << s1 << ip.first << s2 << ip.second << s3 << std::endl; + ostr << s1 << totalKey << s2 << total << s3 << std::endl + << s1 << total123Key << s2 << total123 << s3 << std::endl + << s1 << total23Key << s2 << total23 << s3 << std::endl + << s1 << total3Key << s2 << total3 << s3 << std::endl + << s1 << total3aKey << s2 << total3a << " sec \"" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + 
else + { + // NB: 'setw' affects only the next field (of any type) + ostr << std::setprecision( 6 ); // set precision (default=6): affects all floats + ostr << std::fixed; // fixed format: affects all floats + for( auto ip: m_partitionTimers ) + ostr << std::setw( maxsize ) << ip.first << " : " + << std::setw( 12 ) << ip.second << " sec" << std::endl; + ostr << std::setw( maxsize ) << totalKey << " : " + << std::setw( 12 ) << total << " sec" << std::endl + << std::setw( maxsize ) << total123Key << " : " + << std::setw( 12 ) << total123 << " sec" << std::endl + << std::setw( maxsize ) << total23Key << " : " + << std::setw( 12 ) << total23 << " sec" << std::endl + << std::setw( maxsize ) << total1Key << " : " + << std::setw( 12 ) << total1 << " sec" << std::endl + << std::setw( maxsize ) << total2Key << " : " + << std::setw( 12 ) << total2 << " sec" << std::endl + << std::setw( maxsize ) << total3Key << " : " + << std::setw( 12 ) << total3 << " sec" << std::endl + << std::setw( maxsize ) << total3aKey << " : " + << std::setw( 12 ) << total3a << " sec" << std::endl; + ostr << std::defaultfloat; // default format: affects all floats + } + } + + private: + + Timer m_timer; + std::string m_active; + std::map m_partitionTimers; + std::map m_partitionIds; + }; + +} + +#endif // MGONGPUTIMERMAP_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt new file mode 100644 index 0000000000..bb6d5ee85d --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt @@ -0,0 +1,5 @@ +file(GLOB_RECURSE HEADERS "*.h") +add_library(mg5amc_common Parameters_sm.cc read_slha.cc ${HEADERS}) + +# some XCode specific stuff to make the executable run +set_property(TARGET mg5amc_common PROPERTY XCODE_GENERATE_SCHEME TRUE) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h new file mode 100644 index 0000000000..7574e7f445 --- /dev/null +++ 
b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h @@ -0,0 +1,963 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef HelAmps_MSSM_SLHA2_H +#define HelAmps_MSSM_SLHA2_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuVectors.h" + +#include "Parameters_MSSM_SLHA2.h" + +//#include +//#include +//#include +//#include + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + +#ifdef MGONGPU_INLINE_HELAMPS +#define INLINE inline +#define ALWAYS_INLINE __attribute__( ( always_inline ) ) +#else +#define INLINE +#define ALWAYS_INLINE +#endif + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const 
int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ INLINE void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + 
//-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! + const int nss, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ INLINE void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 
0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ INLINE void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ INLINE void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar // input: particle# out of npar + ) ALWAYS_INLINE; + + //========================================================================== + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + ixxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( -pvec0 * (fptype)nsf, -pvec3 * (fptype)nsf 
); + fi[1] = cxmake( -pvec1 * (fptype)nsf, -pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + if( fmass != 0. ) + { + const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here + const int ip = ( 1 + nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + const int im = ( 1 - nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + fi[2] = cxmake( ip * sqm[ip], 0 ); + fi[3] = cxmake( im * nsf * sqm[ip], 0 ); + fi[4] = cxmake( ip * nsf * sqm[im], 0 ); + fi[5] = cxmake( im * sqm[im], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. }; + omega[1] = fmass / omega[0]; + const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. ); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( pp3 == 0. ? cxmake( -nh, 0. ) : cxmake( nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + fi[2] = sfomega[0] * chi[im]; + fi[3] = sfomega[0] * chi[ip]; + fi[4] = sfomega[1] * chi[im]; + fi[5] = sfomega[1] * chi[ip]; + } +#else + const int ip = ( 1 + nh ) / 2; + const int im = ( 1 - nh ) / 2; + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! 
+ fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!) + sqm[1] = ( fmass < 0 ? -sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const cxtype fiA_2 = ip * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_3 = im * nsf * sqm[ip]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_4 = ip * nsf * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + const cxtype fiA_5 = im * sqm[im]; // scalar cxtype: real part initialised from fptype, imag part = 0 + // Branch B: pp != 0. + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0 ); + const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0 ), + cxternary( ( pp3 == 0. ), + cxmake( -nh, 0 ), + cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + const cxtype_v fiB_2 = sfomega[0] * chi[im]; + const cxtype_v fiB_3 = sfomega[0] * chi[ip]; + const cxtype_v fiB_4 = sfomega[1] * chi[im]; + const cxtype_v fiB_5 = sfomega[1] * chi[ip]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. ); + fi[2] = cxternary( mask, fiA_2, fiB_2 ); + fi[3] = cxternary( mask, fiA_3, fiB_3 ); + fi[4] = cxternary( mask, fiA_4, fiB_4 ); + fi[5] = cxternary( mask, fiA_5, fiB_5 ); +#endif + } + else + { + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. 
), cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[3] = cxzero_sv(); + fi[4] = chi[0]; + fi[5] = chi[1]; + } + else + { + fi[2] = chi[1]; + fi[3] = chi[0]; + fi[4] = cxzero_sv(); + fi[5] = cxzero_sv(); + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + ipzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( -pvec3 * (fptype)nsf, -pvec3 * (fptype)nsf ); + fi[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv sqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + fi[2] = fi[1]; + if( nh == 1 ) + { + fi[3] = fi[1]; + fi[4] = sqp0p3; + } + else + { + fi[3] = sqp0p3; + fi[4] = fi[1]; + } + fi[5] = fi[1]; + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + imzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + fi[0] = cxmake( pvec3 * (fptype)nsf, -pvec3 * (fptype)nsf ); + fi[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv chi = cxmake( -(fptype)nhel * fpsqrt( -2. * pvec3 ), 0. 
); + fi[3] = cxzero_sv(); + fi[4] = cxzero_sv(); + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[5] = chi; + } + else + { + fi[2] = chi; + fi[5] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fi[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + ixzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fi = W_ACCESS::kernelAccess( wavefunctions ); + //fi[0] = cxmake( -pvec0 * nsf, -pvec2 * nsf ); // AV: BUG! not the same as ixxxxx + //fi[1] = cxmake( -pvec0 * nsf, -pvec1 * nsf ); // AV: BUG! not the same as ixxxxx + fi[0] = cxmake( -pvec0 * (fptype)nsf, -pvec3 * (fptype)nsf ); // AV: BUG FIX + fi[1] = cxmake( -pvec1 * (fptype)nsf, -pvec2 * (fptype)nsf ); // AV: BUG FIX + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, pvec2 / sqp0p3 ); + if( nh == 1 ) + { + fi[2] = cxzero_sv(); + fi[3] = cxzero_sv(); + fi[4] = chi0; + fi[5] = chi1; + } + else + { + fi[2] = chi1; + fi[3] = chi0; + fi[4] = cxzero_sv(); + fi[5] = cxzero_sv(); + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction vc[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + vxxxxx( const fptype momenta[], // input: momenta + const fptype vmass, // input: vector boson mass + const int nhel, // input: -1, 0 (only if vmass!=0) or +1 (helicity of vector boson) + const int nsv, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* vc = W_ACCESS::kernelAccess( wavefunctions ); + const fptype sqh = fpsqrt( 0.5 ); // AV this is > 0! + const fptype hel = nhel; + vc[0] = cxmake( pvec0 * (fptype)nsv, pvec3 * (fptype)nsv ); + vc[1] = cxmake( pvec1 * (fptype)nsv, pvec2 * (fptype)nsv ); + if( vmass != 0. ) + { + const int nsvahl = nsv * std::abs( hel ); + const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + vc[2] = cxmake( 0., 0. ); + vc[3] = cxmake( -hel * sqh, 0. ); + vc[4] = cxmake( 0., nsvahl * sqh ); + vc[5] = cxmake( hel0, 0. 
); + } + else + { + const fptype emp = pvec0 / ( vmass * pp ); + vc[2] = cxmake( hel0 * pp / vmass, 0. ); + vc[5] = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0. ); + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + vc[3] = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -nsvahl * pvec2 / pt * sqh ); + vc[4] = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, nsvahl * pvec1 / pt * sqh ); + } + else + { + vc[3] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //vc[4] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV: why abs here? + vc[4] = cxmake( 0., nsvahl * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } + } +#else + // Branch A: pp == 0. + const cxtype vcA_2 = cxmake( 0, 0 ); + const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); + const cxtype vcA_4 = cxmake( 0, nsvahl * sqh ); + const cxtype vcA_5 = cxmake( hel0, 0 ); + // Branch B: pp != 0. + const fptype_v emp = pvec0 / ( vmass * pp ); + const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 ); + const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0 ); + // Branch B1: pp != 0. and pt != 0. + const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; + const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / pt * sqh ); + const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / pt * sqh ); + // Branch B2: pp != 0. and pt == 0. + const cxtype vcB2_3 = cxmake( -hel * sqh, 0. ); + const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B (and from branch B1 and branch B2) + const bool_v mask = ( pp == 0. ); + const bool_v maskB = ( pt != 0. 
); + vc[2] = cxternary( mask, vcA_2, vcB_2 ); + vc[3] = cxternary( mask, vcA_3, cxternary( maskB, vcB1_3, vcB2_3 ) ); + vc[4] = cxternary( mask, vcA_4, cxternary( maskB, vcB1_4, vcB2_4 ) ); + vc[5] = cxternary( mask, vcA_5, vcB_5 ); +#endif + } + else + { + const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 + const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); + vc[2] = cxzero_sv(); + vc[5] = cxmake( hel * pt / pp * sqh, 0. ); +#ifndef MGONGPU_CPPSIMD + if( pt != 0. ) + { + const fptype pzpt = pvec3 / ( pp * pt ) * sqh * hel; + vc[3] = cxmake( -pvec1 * pzpt, -nsv * pvec2 / pt * sqh ); + vc[4] = cxmake( -pvec2 * pzpt, nsv * pvec1 / pt * sqh ); + } + else + { + vc[3] = cxmake( -hel * sqh, 0. ); + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + //vc[4] = cxmake( 0, nsv * ( pvec3 < 0. ? -std::abs( sqh ) : std::abs( sqh ) ) ); // AV why abs here? + vc[4] = cxmake( 0., nsv * ( pvec3 < 0. ? -sqh : sqh ) ); // AV: removed an abs here + } +#else + // Branch A: pt != 0. + const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; + const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / pt * sqh ); + const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / pt * sqh ); + // Branch B: pt == 0. + const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 ); + const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here + // Choose between the results from branch A and branch B + const bool_v mask = ( pt != 0. 
); + vc[3] = cxternary( mask, vcA_3, vcB_3 ); + vc[4] = cxternary( mask, vcA_4, vcB_4 ); +#endif + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction sc[3] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + sxxxxx( const fptype momenta[], // input: momenta + //const fptype, // WARNING: input "smass" unused (missing in Fortran) - scalar boson mass + //const int, // WARNING: input "nhel" unused (missing in Fortran) - scalar has no helicity! + const int nss, // input: +1 (final) or -1 (initial) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* sc = W_ACCESS::kernelAccess( wavefunctions ); + sc[2] = cxmake( 1 + fptype_sv{ 0 }, 0 ); + sc[0] = cxmake( pvec0 * (fptype)nss, pvec3 * (fptype)nss ); + sc[1] = cxmake( pvec1 * (fptype)nss, pvec2 * (fptype)nss ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + template + __host__ __device__ void + oxxxxx( const fptype momenta[], // input: momenta + const fptype fmass, // input: fermion mass + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( 
momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec0 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxmake( pvec1 * (fptype)nsf, pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + if( fmass != 0. ) + { + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); +#ifndef MGONGPU_CPPSIMD + if( pp == 0. ) + { + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses + //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? + sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here + const int ip = -( ( 1 - nh ) / 2 ) * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + const int im = ( 1 + nh ) / 2 * nhel; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ + fo[2] = cxmake( im * sqm[std::abs( ip )], 0 ); + fo[3] = cxmake( ip * nsf * sqm[std::abs( ip )], 0 ); + fo[4] = cxmake( im * nsf * sqm[std::abs( im )], 0 ); + fo[5] = cxmake( ip * sqm[std::abs( im )], 0 ); + } + else + { + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. }; + omega[1] = fmass / omega[0]; + const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 + const fptype sfomeg[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; + const fptype pp3 = fpmax( pp + pvec3, 0. 
); + const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), + ( ( pp3 == 0. ) ? cxmake( -nh, 0. ) + : cxmake( nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + fo[2] = sfomeg[1] * chi[im]; + fo[3] = sfomeg[1] * chi[ip]; + fo[4] = sfomeg[0] * chi[im]; + fo[5] = sfomeg[0] * chi[ip]; + } +#else + // Branch A: pp == 0. + // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! + fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses + sqm[1] = ( fmass < 0 ? -sqm[0] : sqm[0] ); // AV: removed an abs here (as above) + const int ipA = -( ( 1 - nh ) / 2 ) * nhel; + const int imA = ( 1 + nh ) / 2 * nhel; + const cxtype foA_2 = imA * sqm[std::abs( ipA )]; + const cxtype foA_3 = ipA * nsf * sqm[std::abs( ipA )]; + const cxtype foA_4 = imA * nsf * sqm[std::abs( imA )]; + const cxtype foA_5 = ipA * sqm[std::abs( imA )]; + // Branch B: pp != 0. + const fptype sf[2] = { fptype( 1 + nsf + ( 1 - nsf ) * nh ) * (fptype)0.5, + fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; + fptype_v omega[2] = { fpsqrt( pvec0 + pp ), 0 }; + omega[1] = fmass / omega[0]; + const int ipB = ( 1 + nh ) / 2; + const int imB = ( 1 - nh ) / 2; + const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; + const fptype_v pp3 = fpmax( pp + pvec3, 0. ); + const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0. ), + ( cxternary( ( pp3 == 0. ), + cxmake( -nh, 0. ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) ) }; + const cxtype_v foB_2 = sfomeg[1] * chi[imB]; + const cxtype_v foB_3 = sfomeg[1] * chi[ipB]; + const cxtype_v foB_4 = sfomeg[0] * chi[imB]; + const cxtype_v foB_5 = sfomeg[0] * chi[ipB]; + // Choose between the results from branch A and branch B + const bool_v mask = ( pp == 0. 
); + fo[2] = cxternary( mask, foA_2, foB_2 ); + fo[3] = cxternary( mask, foA_3, foB_3 ); + fo[4] = cxternary( mask, foA_4, foB_4 ); + fo[5] = cxternary( mask, foA_5, foB_5 ); +#endif + } + else + { + const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), + 0, + fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + cxternary( ( sqp0p3 == 0. ), + cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; + if( nh == 1 ) + { + fo[2] = chi[0]; + fo[3] = chi[1]; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi[1]; + fo[5] = chi[0]; + } + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) + template + __host__ __device__ void + opzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec3 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv csqp0p3 = cxmake( fpsqrt( 2. * pvec3 ) * (fptype)nsf, 0. 
); + fo[3] = cxzero_sv(); + fo[4] = cxzero_sv(); + if( nh == 1 ) + { + fo[2] = csqp0p3; + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[5] = csqp0p3; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) + template + __host__ __device__ void + omzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( -pvec3 * (fptype)nsf, pvec3 * (fptype)nsf ); // remember pvec0 == -pvec3 + fo[1] = cxzero_sv(); + const int nh = nhel * nsf; + const cxtype_sv chi1 = cxmake( -nhel, 0. ) * fpsqrt( -2. * pvec3 ); + if( nh == 1 ) + { + fo[2] = cxzero_sv(); + fo[3] = chi1; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi1; + //fo[5] = chi1; // AV: BUG! 
+ fo[5] = cxzero_sv(); // AV: BUG FIX + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction fo[6] from the input momenta[npar*4*nevt] + // ASSUMPTIONS: (FMASS == 0) and (PT > 0) + template + __host__ __device__ void + oxzxxx( const fptype momenta[], // input: momenta + //const fptype fmass, // [skip: ASSUME fermion mass==0] + const int nhel, // input: -1 or +1 (helicity of fermion) + const int nsf, // input: +1 (particle) or -1 (antiparticle) + fptype wavefunctions[], // output: wavefunctions + const int ipar ) // input: particle# out of npar + { + mgDebug( 0, __FUNCTION__ ); + const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); + const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); + const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); + const fptype_sv& pvec3 = M_ACCESS::kernelAccessIp4IparConst( momenta, 3, ipar ); + cxtype_sv* fo = W_ACCESS::kernelAccess( wavefunctions ); + fo[0] = cxmake( pvec0 * (fptype)nsf, pvec3 * (fptype)nsf ); + fo[1] = cxmake( pvec1 * (fptype)nsf, pvec2 * (fptype)nsf ); + const int nh = nhel * nsf; + //const float sqp0p3 = sqrtf( pvec0 + pvec3 ) * nsf; // AV: why force a float here? + const fptype_sv sqp0p3 = fpsqrt( pvec0 + pvec3 ) * (fptype)nsf; + const cxtype_sv chi0 = cxmake( sqp0p3, 0. 
); + const cxtype_sv chi1 = cxmake( (fptype)nh * pvec1 / sqp0p3, -pvec2 / sqp0p3 ); + if( nh == 1 ) + { + fo[2] = chi0; + fo[3] = chi1; + fo[4] = cxzero_sv(); + fo[5] = cxzero_sv(); + } + else + { + fo[2] = cxzero_sv(); + fo[3] = cxzero_sv(); + fo[4] = chi1; + fo[5] = chi0; + } + mgDebug( 1, __FUNCTION__ ); + return; + } + + //========================================================================== + + // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] + template + __device__ INLINE void + VVV1P0_1( const fptype allV2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allV1[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] + template + __device__ INLINE void + FFV1_0( const fptype allF1[], + const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] + template + __device__ INLINE void + FFV1_1( const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allF1[] ) ALWAYS_INLINE; + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F2[6]' from the input wavefunctions F1[6], V3[6] + template + __device__ INLINE void + FFV1_2( const fptype allF1[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M2, + const fptype W2, + fptype allF2[] ) ALWAYS_INLINE; + + //========================================================================== + + // Compute the output wavefunction 'V1[6]' from the input wavefunctions V2[6], V3[6] + template + __device__ void + VVV1P0_1( const fptype allV2[], + 
const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allV1[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* V2 = W_ACCESS::kernelAccessConst( allV2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* V1 = W_ACCESS::kernelAccess( allV1 ); + const cxtype cI = cxmake( 0., 1. ); + const fptype_sv P2[4] = { +cxreal( V2[0] ), +cxreal( V2[1] ), +cximag( V2[1] ), +cximag( V2[0] ) }; + const fptype_sv P3[4] = { +cxreal( V3[0] ), +cxreal( V3[1] ), +cximag( V3[1] ), +cximag( V3[0] ) }; + V1[0] = +V2[0] + V3[0]; + V1[1] = +V2[1] + V3[1]; + const fptype_sv P1[4] = { -cxreal( V1[0] ), -cxreal( V1[1] ), -cximag( V1[1] ), -cximag( V1[0] ) }; + const cxtype_sv TMP0 = ( V3[2] * P1[0] - V3[3] * P1[1] - V3[4] * P1[2] - V3[5] * P1[3] ); + const cxtype_sv TMP1 = ( V3[2] * P2[0] - V3[3] * P2[1] - V3[4] * P2[2] - V3[5] * P2[3] ); + const cxtype_sv TMP2 = ( P1[0] * V2[2] - P1[1] * V2[3] - P1[2] * V2[4] - P1[3] * V2[5] ); + const cxtype_sv TMP3 = ( V2[2] * P3[0] - V2[3] * P3[1] - V2[4] * P3[2] - V2[5] * P3[3] ); + const cxtype_sv TMP4 = ( V3[2] * V2[2] - V3[3] * V2[3] - V3[4] * V2[4] - V3[5] * V2[5] ); + const cxtype_sv denom = COUP / ( ( P1[0] * P1[0] ) - ( P1[1] * P1[1] ) - ( P1[2] * P1[2] ) - ( P1[3] * P1[3] ) - M1 * ( M1 - cI * W1 ) ); + V1[2] = denom * ( TMP4 * ( -cI * P2[0] + cI * P3[0] ) + ( V2[2] * ( -cI * TMP0 + cI * TMP1 ) + V3[2] * ( +cI * TMP2 - cI * TMP3 ) ) ); + V1[3] = denom * ( TMP4 * ( -cI * P2[1] + cI * P3[1] ) + ( V2[3] * ( -cI * TMP0 + cI * TMP1 ) + V3[3] * ( +cI * TMP2 - cI * TMP3 ) ) ); + V1[4] = denom * ( TMP4 * ( -cI * P2[2] + cI * P3[2] ) + ( V2[4] * ( -cI * TMP0 + cI * TMP1 ) + V3[4] * ( +cI * TMP2 - cI * TMP3 ) ) ); + V1[5] = denom * ( TMP4 * ( -cI * P2[3] + cI * P3[3] ) + ( V2[5] * ( -cI * TMP0 + cI * TMP1 ) + V3[5] * ( +cI * TMP2 - cI * TMP3 ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + 
//-------------------------------------------------------------------------- + + // Compute the output amplitude 'vertex' from the input wavefunctions F1[6], F2[6], V3[6] + template + __device__ void + FFV1_0( const fptype allF1[], + const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + fptype allvertexes[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); + const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* vertex = A_ACCESS::kernelAccess( allvertexes ); + const cxtype cI = cxmake( 0., 1. ); + const cxtype_sv TMP5 = ( F1[2] * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) + ( F1[3] * ( F2[4] * ( V3[3] - cI * V3[4] ) + F2[5] * ( V3[2] - V3[5] ) ) + ( F1[4] * ( F2[2] * ( V3[2] - V3[5] ) - F2[3] * ( V3[3] + cI * V3[4] ) ) + F1[5] * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ) ); + ( *vertex ) = COUP * -cI * TMP5; + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F1[6]' from the input wavefunctions F2[6], V3[6] + template + __device__ void + FFV1_1( const fptype allF2[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M1, + const fptype W1, + fptype allF1[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F2 = W_ACCESS::kernelAccessConst( allF2 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* F1 = W_ACCESS::kernelAccess( allF1 ); + const cxtype cI = cxmake( 0., 1. ); + F1[0] = +F2[0] + V3[0]; + F1[1] = +F2[1] + V3[1]; + const fptype_sv P1[4] = { -cxreal( F1[0] ), -cxreal( F1[1] ), -cximag( F1[1] ), -cximag( F1[0] ) }; + constexpr fptype one( 1. 
); + const cxtype_sv denom = COUP / ( ( P1[0] * P1[0] ) - ( P1[1] * P1[1] ) - ( P1[2] * P1[2] ) - ( P1[3] * P1[3] ) - M1 * ( M1 - cI * W1 ) ); + F1[2] = denom * cI * ( F2[2] * ( P1[0] * ( -V3[2] + V3[5] ) + ( P1[1] * ( V3[3] - cI * V3[4] ) + ( P1[2] * ( +cI * V3[3] + V3[4] ) + P1[3] * ( -V3[2] + V3[5] ) ) ) ) + ( F2[3] * ( P1[0] * ( V3[3] + cI * V3[4] ) + ( P1[1] * ( -one ) * ( V3[2] + V3[5] ) + ( P1[2] * ( -one ) * ( +cI * ( V3[2] + V3[5] ) ) + P1[3] * ( V3[3] + cI * V3[4] ) ) ) ) + M1 * ( F2[4] * ( V3[2] + V3[5] ) + F2[5] * ( V3[3] + cI * V3[4] ) ) ) ); + F1[3] = denom * ( -cI ) * ( F2[2] * ( P1[0] * ( -V3[3] + cI * V3[4] ) + ( P1[1] * ( V3[2] - V3[5] ) + ( P1[2] * ( -cI * V3[2] + cI * V3[5] ) + P1[3] * ( V3[3] - cI * V3[4] ) ) ) ) + ( F2[3] * ( P1[0] * ( V3[2] + V3[5] ) + ( P1[1] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P1[2] * ( +cI * V3[3] - V3[4] ) - P1[3] * ( V3[2] + V3[5] ) ) ) ) + M1 * ( F2[4] * ( -V3[3] + cI * V3[4] ) + F2[5] * ( -V3[2] + V3[5] ) ) ) ); + F1[4] = denom * ( -cI ) * ( F2[4] * ( P1[0] * ( V3[2] + V3[5] ) + ( P1[1] * ( -V3[3] + cI * V3[4] ) + ( P1[2] * ( -one ) * ( +cI * V3[3] + V3[4] ) - P1[3] * ( V3[2] + V3[5] ) ) ) ) + ( F2[5] * ( P1[0] * ( V3[3] + cI * V3[4] ) + ( P1[1] * ( -V3[2] + V3[5] ) + ( P1[2] * ( -cI * V3[2] + cI * V3[5] ) - P1[3] * ( V3[3] + cI * V3[4] ) ) ) ) + M1 * ( F2[2] * ( -V3[2] + V3[5] ) + F2[3] * ( V3[3] + cI * V3[4] ) ) ) ); + F1[5] = denom * cI * ( F2[4] * ( P1[0] * ( -V3[3] + cI * V3[4] ) + ( P1[1] * ( V3[2] + V3[5] ) + ( P1[2] * ( -one ) * ( +cI * ( V3[2] + V3[5] ) ) + P1[3] * ( -V3[3] + cI * V3[4] ) ) ) ) + ( F2[5] * ( P1[0] * ( -V3[2] + V3[5] ) + ( P1[1] * ( V3[3] + cI * V3[4] ) + ( P1[2] * ( -cI * V3[3] + V3[4] ) + P1[3] * ( -V3[2] + V3[5] ) ) ) ) + M1 * ( F2[2] * ( -V3[3] + cI * V3[4] ) + F2[3] * ( V3[2] + V3[5] ) ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + + // Compute the output wavefunction 'F2[6]' from the input 
wavefunctions F1[6], V3[6] + template + __device__ void + FFV1_2( const fptype allF1[], + const fptype allV3[], + const fptype allCOUP[], + const fptype M2, + const fptype W2, + fptype allF2[] ) + { + mgDebug( 0, __FUNCTION__ ); + const cxtype_sv* F1 = W_ACCESS::kernelAccessConst( allF1 ); + const cxtype_sv* V3 = W_ACCESS::kernelAccessConst( allV3 ); + const cxtype_sv COUP = C_ACCESS::kernelAccessConst( allCOUP ); + cxtype_sv* F2 = W_ACCESS::kernelAccess( allF2 ); + const cxtype cI = cxmake( 0., 1. ); + F2[0] = +F1[0] + V3[0]; + F2[1] = +F1[1] + V3[1]; + const fptype_sv P2[4] = { -cxreal( F2[0] ), -cxreal( F2[1] ), -cximag( F2[1] ), -cximag( F2[0] ) }; + constexpr fptype one( 1. ); + const cxtype_sv denom = COUP / ( ( P2[0] * P2[0] ) - ( P2[1] * P2[1] ) - ( P2[2] * P2[2] ) - ( P2[3] * P2[3] ) - M2 * ( M2 - cI * W2 ) ); + F2[2] = denom * cI * ( F1[2] * ( P2[0] * ( V3[2] + V3[5] ) + ( P2[1] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[2] * ( +cI * V3[3] - V3[4] ) - P2[3] * ( V3[2] + V3[5] ) ) ) ) + ( F1[3] * ( P2[0] * ( V3[3] - cI * V3[4] ) + ( P2[1] * ( -V3[2] + V3[5] ) + ( P2[2] * ( +cI * V3[2] - cI * V3[5] ) + P2[3] * ( -V3[3] + cI * V3[4] ) ) ) ) + M2 * ( F1[4] * ( V3[2] - V3[5] ) + F1[5] * ( -V3[3] + cI * V3[4] ) ) ) ); + F2[3] = denom * ( -cI ) * ( F1[2] * ( P2[0] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[1] * ( V3[2] + V3[5] ) + ( P2[2] * ( +cI * ( V3[2] + V3[5] ) ) - P2[3] * ( V3[3] + cI * V3[4] ) ) ) ) + ( F1[3] * ( P2[0] * ( -V3[2] + V3[5] ) + ( P2[1] * ( V3[3] - cI * V3[4] ) + ( P2[2] * ( +cI * V3[3] + V3[4] ) + P2[3] * ( -V3[2] + V3[5] ) ) ) ) + M2 * ( F1[4] * ( V3[3] + cI * V3[4] ) - F1[5] * ( V3[2] + V3[5] ) ) ) ); + F2[4] = denom * ( -cI ) * ( F1[4] * ( P2[0] * ( -V3[2] + V3[5] ) + ( P2[1] * ( V3[3] + cI * V3[4] ) + ( P2[2] * ( -cI * V3[3] + V3[4] ) + P2[3] * ( -V3[2] + V3[5] ) ) ) ) + ( F1[5] * ( P2[0] * ( V3[3] - cI * V3[4] ) + ( P2[1] * ( -one ) * ( V3[2] + V3[5] ) + ( P2[2] * ( +cI * ( V3[2] + V3[5] ) ) + P2[3] * ( V3[3] - cI * V3[4] ) ) ) ) + M2 * 
( F1[2] * ( -one ) * ( V3[2] + V3[5] ) + F1[3] * ( -V3[3] + cI * V3[4] ) ) ) ); + F2[5] = denom * cI * ( F1[4] * ( P2[0] * ( -one ) * ( V3[3] + cI * V3[4] ) + ( P2[1] * ( V3[2] - V3[5] ) + ( P2[2] * ( +cI * V3[2] - cI * V3[5] ) + P2[3] * ( V3[3] + cI * V3[4] ) ) ) ) + ( F1[5] * ( P2[0] * ( V3[2] + V3[5] ) + ( P2[1] * ( -V3[3] + cI * V3[4] ) + ( P2[2] * ( -one ) * ( +cI * V3[3] + V3[4] ) - P2[3] * ( V3[2] + V3[5] ) ) ) ) + M2 * ( F1[2] * ( V3[3] + cI * V3[4] ) + F1[3] * ( V3[2] - V3[5] ) ) ) ); + mgDebug( 1, __FUNCTION__ ); + return; + } + + //-------------------------------------------------------------------------- + +} // end namespace + +#endif // HelAmps_MSSM_SLHA2_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc new file mode 100644 index 0000000000..38dd9f2ebe --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -0,0 +1,1480 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#include "Parameters_MSSM_SLHA2.h" + +#include +#include + +#ifndef MGONGPU_HARDCODE_PARAM + +// Initialize static instance +Parameters_MSSM_SLHA2* Parameters_MSSM_SLHA2::instance = 0; + +// Function to get static instance - only one instance per program +Parameters_MSSM_SLHA2* +Parameters_MSSM_SLHA2::getInstance() +{ + if( instance == 0 ) + instance = new Parameters_MSSM_SLHA2(); + return instance; +} + +void +Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) +{ + zero = 0; // define "zero" + ZERO = 0; // define "zero" + //std::vector indices(2, 0); // prepare a vector for indices + mdl_Wsl6 = slha.get_block_entry( "decay", 2000015, 2.699061e-01 ); + mdl_Wsl5 = slha.get_block_entry( "decay", 2000013, 2.161216e-01 ); + mdl_Wsl4 = slha.get_block_entry( "decay", 2000011, 2.161216e-01 ); + mdl_Wsu6 = slha.get_block_entry( "decay", 2000006, 7.373133e+00 ); + mdl_Wsd6 = slha.get_block_entry( "decay", 2000005, 8.015663e-01 ); + mdl_Wsu5 = slha.get_block_entry( "decay", 2000004, 1.152973e+00 ); + mdl_Wsd5 = slha.get_block_entry( "decay", 2000003, 2.858123e-01 ); + mdl_Wsu4 = slha.get_block_entry( "decay", 2000002, 1.152973e+00 ); + mdl_Wsd4 = slha.get_block_entry( "decay", 2000001, 2.858123e-01 ); + mdl_Wch2 = slha.get_block_entry( "decay", 1000037, 2.486895e+00 ); + mdl_Wneu4 = slha.get_block_entry( "decay", 1000035, 2.585851e+00 ); + mdl_Wneu3 = slha.get_block_entry( "decay", 1000025, 1.915985e+00 ); + mdl_Wch1 = slha.get_block_entry( "decay", 1000024, 1.704145e-02 ); + mdl_Wneu2 = slha.get_block_entry( "decay", 1000023, 2.077700e-02 ); + mdl_Wgo = slha.get_block_entry( "decay", 1000021, 5.506754e+00 ); + mdl_Wsn3 = slha.get_block_entry( "decay", 1000016, 1.475190e-01 ); + mdl_Wsl3 = slha.get_block_entry( "decay", 1000015, 1.483273e-01 ); + 
mdl_Wsn2 = slha.get_block_entry( "decay", 1000014, 1.498816e-01 ); + mdl_Wsl2 = slha.get_block_entry( "decay", 1000013, 2.136822e-01 ); + mdl_Wsn1 = slha.get_block_entry( "decay", 1000012, 1.498816e-01 ); + mdl_Wsl1 = slha.get_block_entry( "decay", 1000011, 2.136822e-01 ); + mdl_Wsu3 = slha.get_block_entry( "decay", 1000006, 2.021596e+00 ); + mdl_Wsd3 = slha.get_block_entry( "decay", 1000005, 3.736276e+00 ); + mdl_Wsu2 = slha.get_block_entry( "decay", 1000004, 5.477195e+00 ); + mdl_Wsd2 = slha.get_block_entry( "decay", 1000003, 5.312788e+00 ); + mdl_Wsu1 = slha.get_block_entry( "decay", 1000002, 5.477195e+00 ); + mdl_Wsd1 = slha.get_block_entry( "decay", 1000001, 5.312788e+00 ); + mdl_WH = slha.get_block_entry( "decay", 37, 5.469628e-01 ); + mdl_WA0 = slha.get_block_entry( "decay", 36, 6.321785e-01 ); + mdl_WH02 = slha.get_block_entry( "decay", 35, 5.748014e-01 ); + mdl_WH01 = slha.get_block_entry( "decay", 25, 1.986108e-03 ); + mdl_WW = slha.get_block_entry( "decay", 24, 2.002822e+00 ); + mdl_WZ = slha.get_block_entry( "decay", 23, 2.411433e+00 ); + mdl_WT = slha.get_block_entry( "decay", 6, 1.561950e+00 ); + indices[0] = 3; + indices[1] = 3; + mdl_Ryu3x3 = slha.get_block_entry( "yu", indices, 8.928445e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_Rye3x3 = slha.get_block_entry( "ye", indices, 1.008908e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_Ryd3x3 = slha.get_block_entry( "yd", indices, 1.388402e-01 ); + indices[0] = 2; + indices[1] = 2; + mdl_RVV2x2 = slha.get_block_entry( "vmix", indices, 9.725578e-01 ); + indices[0] = 2; + indices[1] = 1; + mdl_RVV2x1 = slha.get_block_entry( "vmix", indices, 2.326612e-01 ); + indices[0] = 1; + indices[1] = 2; + mdl_RVV1x2 = slha.get_block_entry( "vmix", indices, -2.326612e-01 ); + indices[0] = 1; + indices[1] = 1; + mdl_RVV1x1 = slha.get_block_entry( "vmix", indices, 9.725578e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RCKM3x3 = slha.get_block_entry( "vckm", indices, 1.000000e+00 ); + indices[0] = 2; + 
indices[1] = 2; + mdl_RCKM2x2 = slha.get_block_entry( "vckm", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RCKM1x1 = slha.get_block_entry( "vckm", indices, 1.000000e+00 ); + indices[0] = 6; + indices[1] = 6; + mdl_RRu6x6 = slha.get_block_entry( "usqmix", indices, -5.536450e-01 ); + indices[0] = 6; + indices[1] = 3; + mdl_RRu6x3 = slha.get_block_entry( "usqmix", indices, 8.327528e-01 ); + indices[0] = 5; + indices[1] = 5; + mdl_RRu5x5 = slha.get_block_entry( "usqmix", indices, 1.000000e+00 ); + indices[0] = 4; + indices[1] = 4; + mdl_RRu4x4 = slha.get_block_entry( "usqmix", indices, 1.000000e+00 ); + indices[0] = 3; + indices[1] = 6; + mdl_RRu3x6 = slha.get_block_entry( "usqmix", indices, 8.327528e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RRu3x3 = slha.get_block_entry( "usqmix", indices, 5.536450e-01 ); + indices[0] = 2; + indices[1] = 2; + mdl_RRu2x2 = slha.get_block_entry( "usqmix", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RRu1x1 = slha.get_block_entry( "usqmix", indices, 1.000000e+00 ); + indices[0] = 3; + indices[1] = 3; + mdl_RMNS3x3 = slha.get_block_entry( "upmns", indices, 1.000000e+00 ); + indices[0] = 2; + indices[1] = 2; + mdl_RMNS2x2 = slha.get_block_entry( "upmns", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RMNS1x1 = slha.get_block_entry( "upmns", indices, 1.000000e+00 ); + indices[0] = 2; + indices[1] = 2; + mdl_RUU2x2 = slha.get_block_entry( "umix", indices, 9.168349e-01 ); + indices[0] = 2; + indices[1] = 1; + mdl_RUU2x1 = slha.get_block_entry( "umix", indices, 3.992666e-01 ); + indices[0] = 1; + indices[1] = 2; + mdl_RUU1x2 = slha.get_block_entry( "umix", indices, -3.992666e-01 ); + indices[0] = 1; + indices[1] = 1; + mdl_RUU1x1 = slha.get_block_entry( "umix", indices, 9.168349e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_Rtu3x3 = slha.get_block_entry( "tu", indices, -4.447525e+02 ); + indices[0] = 3; + indices[1] = 3; + mdl_Rte3x3 = slha.get_block_entry( "te", 
indices, -2.540197e+01 ); + indices[0] = 3; + indices[1] = 3; + mdl_Rtd3x3 = slha.get_block_entry( "td", indices, -1.106937e+02 ); + indices[0] = 3; + indices[1] = 3; + mdl_RRn3x3 = slha.get_block_entry( "snumix", indices, 1.000000e+00 ); + indices[0] = 2; + indices[1] = 2; + mdl_RRn2x2 = slha.get_block_entry( "snumix", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RRn1x1 = slha.get_block_entry( "snumix", indices, 1.000000e+00 ); + //aS = slha.get_block_entry( "sminputs", 3, 1.180000e-01 ); // now retrieved event-by-event (as G) from Fortran (running alphas #373) + aEWM1 = slha.get_block_entry( "sminputs", 1, 1.279340e+02 ); + indices[0] = 6; + indices[1] = 6; + mdl_RRl6x6 = slha.get_block_entry( "selmix", indices, -2.824872e-01 ); + indices[0] = 6; + indices[1] = 3; + mdl_RRl6x3 = slha.get_block_entry( "selmix", indices, 9.592711e-01 ); + indices[0] = 5; + indices[1] = 5; + mdl_RRl5x5 = slha.get_block_entry( "selmix", indices, 1.000000e+00 ); + indices[0] = 4; + indices[1] = 4; + mdl_RRl4x4 = slha.get_block_entry( "selmix", indices, 1.000000e+00 ); + indices[0] = 3; + indices[1] = 6; + mdl_RRl3x6 = slha.get_block_entry( "selmix", indices, 9.592711e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RRl3x3 = slha.get_block_entry( "selmix", indices, 2.824872e-01 ); + indices[0] = 2; + indices[1] = 2; + mdl_RRl2x2 = slha.get_block_entry( "selmix", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RRl1x1 = slha.get_block_entry( "selmix", indices, 1.000000e+00 ); + indices[0] = 4; + indices[1] = 4; + mdl_RNN4x4 = slha.get_block_entry( "nmix", indices, -6.843778e-01 ); + indices[0] = 4; + indices[1] = 3; + mdl_RNN4x3 = slha.get_block_entry( "nmix", indices, 6.492260e-01 ); + indices[0] = 4; + indices[1] = 2; + mdl_RNN4x2 = slha.get_block_entry( "nmix", indices, 3.107390e-01 ); + indices[0] = 4; + indices[1] = 1; + mdl_RNN4x1 = slha.get_block_entry( "nmix", indices, -1.165071e-01 ); + indices[0] = 3; + indices[1] = 4; + mdl_RNN3x4 = 
slha.get_block_entry( "nmix", indices, 7.102270e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RNN3x3 = slha.get_block_entry( "nmix", indices, 6.958775e-01 ); + indices[0] = 3; + indices[1] = 2; + mdl_RNN3x2 = slha.get_block_entry( "nmix", indices, 8.770049e-02 ); + indices[0] = 3; + indices[1] = 1; + mdl_RNN3x1 = slha.get_block_entry( "nmix", indices, -6.033880e-02 ); + indices[0] = 2; + indices[1] = 4; + mdl_RNN2x4 = slha.get_block_entry( "nmix", indices, 1.561507e-01 ); + indices[0] = 2; + indices[1] = 3; + mdl_RNN2x3 = slha.get_block_entry( "nmix", indices, -2.698467e-01 ); + indices[0] = 2; + indices[1] = 2; + mdl_RNN2x2 = slha.get_block_entry( "nmix", indices, 9.449493e-01 ); + indices[0] = 2; + indices[1] = 1; + mdl_RNN2x1 = slha.get_block_entry( "nmix", indices, 9.935054e-02 ); + indices[0] = 1; + indices[1] = 4; + mdl_RNN1x4 = slha.get_block_entry( "nmix", indices, -5.311861e-02 ); + indices[0] = 1; + indices[1] = 3; + mdl_RNN1x3 = slha.get_block_entry( "nmix", indices, 1.464340e-01 ); + indices[0] = 1; + indices[1] = 2; + mdl_RNN1x2 = slha.get_block_entry( "nmix", indices, -5.311036e-02 ); + indices[0] = 1; + indices[1] = 1; + mdl_RNN1x1 = slha.get_block_entry( "nmix", indices, 9.863644e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RmU23x3 = slha.get_block_entry( "msu2", indices, 1.791371e+05 ); + indices[0] = 1; + indices[1] = 1; + mdl_RmU21x1 = slha.get_block_entry( "msu2", indices, 2.803821e+05 ); + indices[0] = 3; + indices[1] = 3; + mdl_RmQ23x3 = slha.get_block_entry( "msq2", indices, 2.487654e+05 ); + indices[0] = 1; + indices[1] = 1; + mdl_RmQ21x1 = slha.get_block_entry( "msq2", indices, 2.998367e+05 ); + mdl_mHu2 = slha.get_block_entry( "msoft", 22, -1.288001e+05 ); + mdl_mHd2 = slha.get_block_entry( "msoft", 21, 3.233749e+04 ); + mdl_RMx3 = slha.get_block_entry( "msoft", 3, 5.882630e+02 ); + mdl_RMx2 = slha.get_block_entry( "msoft", 2, 1.915042e+02 ); + mdl_RMx1 = slha.get_block_entry( "msoft", 1, 1.013965e+02 ); + indices[0] = 3; + 
indices[1] = 3; + mdl_RmL23x3 = slha.get_block_entry( "msl2", indices, 3.782868e+04 ); + indices[0] = 1; + indices[1] = 1; + mdl_RmL21x1 = slha.get_block_entry( "msl2", indices, 3.815567e+04 ); + indices[0] = 3; + indices[1] = 3; + mdl_RmE23x3 = slha.get_block_entry( "mse2", indices, 1.796764e+04 ); + indices[0] = 1; + indices[1] = 1; + mdl_RmE21x1 = slha.get_block_entry( "mse2", indices, 1.863063e+04 ); + indices[0] = 3; + indices[1] = 3; + mdl_RmD23x3 = slha.get_block_entry( "msd2", indices, 2.702620e+05 ); + indices[0] = 1; + indices[1] = 1; + mdl_RmD21x1 = slha.get_block_entry( "msd2", indices, 2.736847e+05 ); + mdl_Msl6 = slha.get_block_entry( "mass", 2000015, 2.068678e+02 ); + mdl_Msl4 = slha.get_block_entry( "mass", 2000011, 1.441028e+02 ); + mdl_Msu6 = slha.get_block_entry( "mass", 2000006, 5.857858e+02 ); + mdl_Msd6 = slha.get_block_entry( "mass", 2000005, 5.437267e+02 ); + mdl_Msu4 = slha.get_block_entry( "mass", 2000002, 5.492593e+02 ); + mdl_Msd4 = slha.get_block_entry( "mass", 2000001, 5.452285e+02 ); + mdl_Mch2 = slha.get_block_entry( "mass", 1000037, 3.799393e+02 ); + mdl_Mneu4 = slha.get_block_entry( "mass", 1000035, 3.817294e+02 ); + mdl_Mneu3 = slha.get_block_entry( "mass", 1000025, -3.637560e+02 ); + mdl_Mch1 = slha.get_block_entry( "mass", 1000024, 1.816965e+02 ); + mdl_Mneu2 = slha.get_block_entry( "mass", 1000023, 1.810882e+02 ); + mdl_Mneu1 = slha.get_block_entry( "mass", 1000022, 9.668807e+01 ); + mdl_Mgo = slha.get_block_entry( "mass", 1000021, 6.077137e+02 ); + mdl_Msn3 = slha.get_block_entry( "mass", 1000016, 1.847085e+02 ); + mdl_Msl3 = slha.get_block_entry( "mass", 1000015, 1.344909e+02 ); + mdl_Msn1 = slha.get_block_entry( "mass", 1000012, 1.852583e+02 ); + mdl_Msl1 = slha.get_block_entry( "mass", 1000011, 2.029157e+02 ); + mdl_Msu3 = slha.get_block_entry( "mass", 1000006, 3.996685e+02 ); + mdl_Msd3 = slha.get_block_entry( "mass", 1000005, 5.130652e+02 ); + mdl_Msu1 = slha.get_block_entry( "mass", 1000002, 5.611190e+02 ); + mdl_Msd1 = 
slha.get_block_entry( "mass", 1000001, 5.684411e+02 ); + mdl_MH = slha.get_block_entry( "mass", 37, 4.078790e+02 ); + mdl_MA0 = slha.get_block_entry( "mass", 36, 3.995839e+02 ); + mdl_MH02 = slha.get_block_entry( "mass", 35, 3.999601e+02 ); + mdl_MH01 = slha.get_block_entry( "mass", 25, 1.108991e+02 ); + mdl_MW = slha.get_block_entry( "mass", 24, 7.982901e+01 ); + mdl_MZ = slha.get_block_entry( "mass", 23, 9.118760e+01 ); + mdl_Mta = slha.get_block_entry( "mass", 15, 1.777000e+00 ); + mdl_MT = slha.get_block_entry( "mass", 6, 1.750000e+02 ); + mdl_MB = slha.get_block_entry( "mass", 5, 4.889917e+00 ); + mdl_MA2 = slha.get_block_entry( "hmix", 4, 1.664391e+05 ); + mdl_tb = slha.get_block_entry( "hmix", 2, 9.748624e+00 ); + mdl_RMUH = slha.get_block_entry( "hmix", 1, 3.576810e+02 ); + mdl_alp = slha.get_block_entry( "fralpha", 1, -1.138252e-01 ); + indices[0] = 6; + indices[1] = 6; + mdl_RRd6x6 = slha.get_block_entry( "dsqmix", indices, 9.387379e-01 ); + indices[0] = 6; + indices[1] = 3; + mdl_RRd6x3 = slha.get_block_entry( "dsqmix", indices, -3.446319e-01 ); + indices[0] = 5; + indices[1] = 5; + mdl_RRd5x5 = slha.get_block_entry( "dsqmix", indices, 1.000000e+00 ); + indices[0] = 4; + indices[1] = 4; + mdl_RRd4x4 = slha.get_block_entry( "dsqmix", indices, 1.000000e+00 ); + indices[0] = 3; + indices[1] = 6; + mdl_RRd3x6 = slha.get_block_entry( "dsqmix", indices, 3.446319e-01 ); + indices[0] = 3; + indices[1] = 3; + mdl_RRd3x3 = slha.get_block_entry( "dsqmix", indices, 9.387379e-01 ); + indices[0] = 2; + indices[1] = 2; + mdl_RRd2x2 = slha.get_block_entry( "dsqmix", indices, 1.000000e+00 ); + indices[0] = 1; + indices[1] = 1; + mdl_RRd1x1 = slha.get_block_entry( "dsqmix", indices, 1.000000e+00 ); + mdl_Msd5 = 1. * mdl_Msd4; + mdl_Msd2 = 1. * mdl_Msd1; + mdl_Msu5 = 1. * mdl_Msu4; + mdl_Msu2 = 1. * mdl_Msu1; + mdl_Msl5 = 1. * mdl_Msl4; + mdl_Msl2 = 1. * mdl_Msl1; + mdl_Msn2 = 1. * mdl_Msn1; + mdl_RmU22x2 = 1. * mdl_RmU21x1; + mdl_RmQ22x2 = 1. 
* mdl_RmQ21x1; + mdl_RmL22x2 = 1. * mdl_RmL21x1; + mdl_RmE22x2 = 1. * mdl_RmE21x1; + mdl_RmD22x2 = 1. * mdl_RmD21x1; + mdl_conjg__Rn3x3 = 1.; + mdl_conjg__CKM3x3 = 1.; + mdl_Ru4x4 = 1.; + mdl_Ru1x1 = 1.; + mdl_Rn3x3 = 1.; + mdl_Rn1x1 = 1.; + mdl_Rl4x4 = 1.; + mdl_Rl1x1 = 1.; + mdl_Rd4x4 = 1.; + mdl_Rd1x1 = 1.; + mdl_I98x11 = 1.; + mdl_I97x11 = 1.; + mdl_I96x11 = 1.; + mdl_I93x11 = 1.; + mdl_I92x11 = 1.; + mdl_I87x11 = 1.; + mdl_I82x11 = 1.; + mdl_I74x11 = 1.; + mdl_I6x44 = 1.; + mdl_I5x11 = 1.; + mdl_I53x11 = 1.; + mdl_I52x44 = 1.; + mdl_I51x11 = 1.; + mdl_I39x11 = 1.; + mdl_I31x11 = 1.; + mdl_I26x44 = 1.; + mdl_I25x11 = 1.; + mdl_I12x11 = 1.; + mdl_I102x44 = 1.; + mdl_I101x44 = 1.; + mdl_I100x44 = 1.; + mdl_CKM3x3 = 1.; + mdl_atan__tb = atan( mdl_tb ); + mdl_beta = mdl_atan__tb; + mdl_cw = mdl_MW / mdl_MZ; + mdl_mD21x1 = mdl_RmD21x1; + mdl_mD22x2 = mdl_RmD22x2; + mdl_mD23x3 = mdl_RmD23x3; + mdl_mE21x1 = mdl_RmE21x1; + mdl_mE22x2 = mdl_RmE22x2; + mdl_mE23x3 = mdl_RmE23x3; + mdl_mL21x1 = mdl_RmL21x1; + mdl_mL22x2 = mdl_RmL22x2; + mdl_mL23x3 = mdl_RmL23x3; + mdl_mQ21x1 = mdl_RmQ21x1; + mdl_mQ22x2 = mdl_RmQ22x2; + mdl_mQ23x3 = mdl_RmQ23x3; + mdl_mU21x1 = mdl_RmU21x1; + mdl_mU22x2 = mdl_RmU22x2; + mdl_mU23x3 = mdl_RmU23x3; + mdl_MUH = mdl_RMUH; + mdl_Mx1 = mdl_RMx1; + mdl_Mx2 = mdl_RMx2; + mdl_Mx3 = mdl_RMx3; + mdl_NN1x1 = mdl_RNN1x1; + mdl_NN1x2 = mdl_RNN1x2; + mdl_NN1x3 = mdl_RNN1x3; + mdl_NN1x4 = mdl_RNN1x4; + mdl_NN2x1 = mdl_RNN2x1; + mdl_NN2x2 = mdl_RNN2x2; + mdl_NN2x3 = mdl_RNN2x3; + mdl_NN2x4 = mdl_RNN2x4; + mdl_NN3x1 = mdl_RNN3x1; + mdl_NN3x2 = mdl_RNN3x2; + mdl_NN3x3 = mdl_RNN3x3; + mdl_NN3x4 = mdl_RNN3x4; + mdl_NN4x1 = mdl_RNN4x1; + mdl_NN4x2 = mdl_RNN4x2; + mdl_NN4x3 = mdl_RNN4x3; + mdl_NN4x4 = mdl_RNN4x4; + mdl_Rd3x3 = mdl_RRd3x3; + mdl_Rd3x6 = mdl_RRd3x6; + mdl_Rd6x3 = mdl_RRd6x3; + mdl_Rd6x6 = mdl_RRd6x6; + mdl_Rl3x3 = mdl_RRl3x3; + mdl_Rl3x6 = mdl_RRl3x6; + mdl_Rl6x3 = mdl_RRl6x3; + mdl_Rl6x6 = mdl_RRl6x6; + mdl_Ru3x3 = mdl_RRu3x3; + mdl_Ru3x6 = 
mdl_RRu3x6; + mdl_Ru6x3 = mdl_RRu6x3; + mdl_Ru6x6 = mdl_RRu6x6; + mdl_UU1x1 = mdl_RUU1x1; + mdl_UU1x2 = mdl_RUU1x2; + mdl_UU2x1 = mdl_RUU2x1; + mdl_UU2x2 = mdl_RUU2x2; + mdl_VV1x1 = mdl_RVV1x1; + mdl_VV1x2 = mdl_RVV1x2; + mdl_VV2x1 = mdl_RVV2x1; + mdl_VV2x2 = mdl_RVV2x2; + mdl_td3x3 = mdl_Rtd3x3; + mdl_te3x3 = mdl_Rte3x3; + mdl_tu3x3 = mdl_Rtu3x3; + mdl_yd3x3 = mdl_Ryd3x3; + mdl_ye3x3 = mdl_Rye3x3; + mdl_yu3x3 = mdl_Ryu3x3; + mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); + mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * cos( 2. * mdl_beta ) ) * tan( 2. * mdl_beta ) ) / 2.; + mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); + mdl_sw = sqrt( 1. - mdl_cw__exp__2 ); + mdl_cos__beta = cos( mdl_beta ); + mdl_sin__beta = sin( mdl_beta ); + mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); + mdl_I1x33 = mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + mdl_conjg__yd3x3 = conj( mdl_yd3x3 ); + mdl_I10x33 = mdl_Rd3x3 * mdl_conjg__yd3x3; + mdl_I10x36 = mdl_Rd6x3 * mdl_conjg__yd3x3; + mdl_conjg__Rd3x6 = conj( mdl_Rd3x6 ); + mdl_I100x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + mdl_I100x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + mdl_conjg__Rd6x6 = conj( mdl_Rd6x6 ); + mdl_I100x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + mdl_I100x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + mdl_conjg__Rl3x6 = conj( mdl_Rl3x6 ); + mdl_I101x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + mdl_I101x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + mdl_conjg__Rl6x6 = conj( mdl_Rl6x6 ); + mdl_I101x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + mdl_I101x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + mdl_conjg__Ru3x6 = conj( mdl_Ru3x6 ); + mdl_I102x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + mdl_I102x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + mdl_conjg__Ru6x6 = conj( mdl_Ru6x6 ); + mdl_I102x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + mdl_I102x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + mdl_I11x33 = mdl_Rd3x6 * mdl_yd3x3; + mdl_I11x36 = mdl_Rd6x6 * mdl_yd3x3; + mdl_conjg__Rd3x3 = conj( mdl_Rd3x3 ); + mdl_I12x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + mdl_I12x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + mdl_conjg__Rd6x3 = conj( mdl_Rd6x3 
); + mdl_I12x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + mdl_I12x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + mdl_I13x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + mdl_I13x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + mdl_I13x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + mdl_I13x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + mdl_conjg__td3x3 = conj( mdl_td3x3 ); + mdl_I14x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + mdl_I14x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + mdl_I14x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + mdl_I14x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + mdl_I15x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I15x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I15x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I15x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I16x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + mdl_I16x36 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + mdl_I16x63 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + mdl_I16x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + mdl_I17x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + mdl_I17x36 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + mdl_I17x63 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + mdl_I17x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + mdl_I18x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; + mdl_I18x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; + mdl_I18x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + mdl_I18x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + mdl_I19x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I19x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I19x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I19x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I2x33 = mdl_yd3x3 * mdl_conjg__CKM3x3; + mdl_I20x33 = mdl_CKM3x3 * mdl_conjg__yd3x3; + mdl_I21x33 = 
mdl_CKM3x3 * mdl_yu3x3; + mdl_conjg__ye3x3 = conj( mdl_ye3x3 ); + mdl_I22x33 = mdl_conjg__ye3x3; + mdl_I23x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I23x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_conjg__Rl3x3 = conj( mdl_Rl3x3 ); + mdl_I24x33 = mdl_ye3x3 * mdl_conjg__Rl3x3; + mdl_conjg__Rl6x3 = conj( mdl_Rl6x3 ); + mdl_I24x36 = mdl_ye3x3 * mdl_conjg__Rl6x3; + mdl_I25x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + mdl_I25x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + mdl_I25x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + mdl_I25x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + mdl_I26x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + mdl_I26x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + mdl_I26x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + mdl_I26x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + mdl_I27x33 = mdl_Rl3x3 * mdl_conjg__ye3x3; + mdl_I27x36 = mdl_Rl6x3 * mdl_conjg__ye3x3; + mdl_I28x33 = mdl_Rl3x6 * mdl_ye3x3; + mdl_I28x36 = mdl_Rl6x6 * mdl_ye3x3; + mdl_I29x33 = mdl_Rl3x3; + mdl_I29x36 = mdl_Rl6x3; + mdl_I3x33 = mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I3x36 = mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I30x33 = mdl_Rl3x6 * mdl_ye3x3; + mdl_I30x36 = mdl_Rl6x6 * mdl_ye3x3; + mdl_I31x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + mdl_I31x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + mdl_I31x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + mdl_I31x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + mdl_I32x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + mdl_I32x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + mdl_I32x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + mdl_I32x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + mdl_conjg__te3x3 = conj( mdl_te3x3 ); + mdl_I33x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + mdl_I33x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + mdl_I33x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + mdl_I33x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + mdl_I34x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I34x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I34x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I34x66 = mdl_Rl6x3 
* mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I35x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + mdl_I35x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + mdl_I35x63 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + mdl_I35x66 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + mdl_I36x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + mdl_I36x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + mdl_I36x63 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + mdl_I36x66 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + mdl_I37x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + mdl_I37x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + mdl_I37x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + mdl_I37x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + mdl_I38x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I38x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I38x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I38x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I39x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + mdl_I39x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + mdl_I4x33 = mdl_yd3x3 * mdl_conjg__Rd3x3; + mdl_I4x36 = mdl_yd3x3 * mdl_conjg__Rd6x3; + mdl_I40x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + mdl_I40x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + mdl_I41x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + mdl_I41x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + mdl_I42x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + mdl_I42x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + mdl_I44x33 = mdl_Rn3x3 * mdl_conjg__ye3x3; + mdl_I45x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + mdl_I45x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + mdl_I46x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I46x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I47x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + 
mdl_I47x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + mdl_I48x33 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + mdl_I48x36 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + mdl_I49x33 = mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I49x36 = mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I5x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + mdl_I5x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + mdl_I5x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + mdl_I5x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + mdl_conjg__Ru3x3 = conj( mdl_Ru3x3 ); + mdl_I50x33 = mdl_yu3x3 * mdl_conjg__Ru3x3; + mdl_conjg__Ru6x3 = conj( mdl_Ru6x3 ); + mdl_I50x36 = mdl_yu3x3 * mdl_conjg__Ru6x3; + mdl_I51x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + mdl_I51x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + mdl_I51x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + mdl_I51x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + mdl_I52x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + mdl_I52x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + mdl_I52x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + mdl_I52x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + mdl_I53x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I53x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I53x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I53x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_conjg__tu3x3 = conj( mdl_tu3x3 ); + mdl_I54x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + mdl_I54x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + mdl_I54x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + mdl_I54x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + mdl_I55x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I55x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I55x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I55x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + 
mdl_I56x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I56x36 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I56x63 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I56x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I57x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I57x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I57x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I57x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I58x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + mdl_I58x36 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + mdl_I58x63 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + mdl_I58x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + mdl_I59x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I59x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I59x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I59x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I6x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + mdl_I6x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + mdl_I6x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + mdl_I6x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + mdl_I60x33 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + mdl_I60x36 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + mdl_I60x63 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + mdl_I60x66 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + mdl_I61x33 = mdl_Ru3x3 * mdl_conjg__yu3x3; + mdl_I61x36 
= mdl_Ru6x3 * mdl_conjg__yu3x3; + mdl_I62x33 = mdl_Ru3x6 * mdl_yu3x3; + mdl_I62x36 = mdl_Ru6x6 * mdl_yu3x3; + mdl_I63x33 = mdl_CKM3x3 * mdl_Ru3x3; + mdl_I63x36 = mdl_CKM3x3 * mdl_Ru6x3; + mdl_I64x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__yd3x3; + mdl_I64x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__yd3x3; + mdl_I65x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3; + mdl_I65x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3; + mdl_I66x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + mdl_I66x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + mdl_I66x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + mdl_I66x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; + mdl_I67x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I67x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I67x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I67x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I68x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + mdl_I68x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + mdl_I68x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + mdl_I68x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + mdl_I69x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + mdl_I69x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + mdl_I69x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + mdl_I69x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + mdl_I7x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3; + mdl_I7x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3; + mdl_I70x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + mdl_I70x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + mdl_I70x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + mdl_I70x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + mdl_I71x33 = 
mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + mdl_I71x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + mdl_I71x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + mdl_I71x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + mdl_I72x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; + mdl_I72x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; + mdl_I72x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + mdl_I72x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + mdl_I73x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I73x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I73x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I73x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I74x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + mdl_I74x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + mdl_I74x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + mdl_I74x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + mdl_I75x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + mdl_I75x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + mdl_I75x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + mdl_I75x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + mdl_I76x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I76x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I76x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I76x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I77x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + mdl_I77x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + mdl_I77x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + mdl_I77x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + mdl_I78x33 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + mdl_I78x36 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + mdl_I78x63 = mdl_Ru3x6 * mdl_tu3x3 
* mdl_conjg__Ru6x3; + mdl_I78x66 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; + mdl_I79x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + mdl_I79x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + mdl_I79x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + mdl_I79x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + mdl_I8x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + mdl_I8x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + mdl_I80x33 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + mdl_I80x36 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + mdl_I80x63 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + mdl_I80x66 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + mdl_I81x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I81x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I81x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I81x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I82x33 = mdl_CKM3x3 * mdl_conjg__Rd3x3; + mdl_I82x36 = mdl_CKM3x3 * mdl_conjg__Rd6x3; + mdl_I83x33 = mdl_CKM3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + mdl_I83x36 = mdl_CKM3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + mdl_I84x33 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3; + mdl_I84x36 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3; + mdl_I85x33 = mdl_conjg__Rl3x3; + mdl_I85x36 = mdl_conjg__Rl6x3; + mdl_I86x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + mdl_I86x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + mdl_I88x33 = mdl_ye3x3 * mdl_conjg__Rn3x3; + mdl_I89x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I89x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I9x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + mdl_I9x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + mdl_I90x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + mdl_I90x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + mdl_I91x33 = mdl_yd3x3 * 
mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I91x36 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I92x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + mdl_I92x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + mdl_I92x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + mdl_I92x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; + mdl_I93x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + mdl_I93x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + mdl_I94x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I94x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I94x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + mdl_I94x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + mdl_I95x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + mdl_I95x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + mdl_I96x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + mdl_I96x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + mdl_I96x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + mdl_I96x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + mdl_I97x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + mdl_I97x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + mdl_I97x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + mdl_I97x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + mdl_I98x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + mdl_I98x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + mdl_I98x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + mdl_I98x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + mdl_I99x33 = mdl_ye3x3; + mdl_complexi = cxsmpl( 0., 1. ); + mdl_sqrt__2 = sqrt( 2. 
); + mdl_sw__exp__2 = ( ( mdl_sw ) * ( mdl_sw ) ); + mdl_conjg__NN1x1 = conj( mdl_NN1x1 ); + mdl_conjg__NN1x2 = conj( mdl_NN1x2 ); + mdl_conjg__NN1x3 = conj( mdl_NN1x3 ); + mdl_conjg__NN1x4 = conj( mdl_NN1x4 ); + mdl_conjg__NN2x1 = conj( mdl_NN2x1 ); + mdl_conjg__NN2x2 = conj( mdl_NN2x2 ); + mdl_conjg__NN2x3 = conj( mdl_NN2x3 ); + mdl_conjg__NN2x4 = conj( mdl_NN2x4 ); + mdl_conjg__NN3x1 = conj( mdl_NN3x1 ); + mdl_conjg__NN3x2 = conj( mdl_NN3x2 ); + mdl_conjg__NN3x3 = conj( mdl_NN3x3 ); + mdl_conjg__NN3x4 = conj( mdl_NN3x4 ); + mdl_conjg__NN4x1 = conj( mdl_NN4x1 ); + mdl_conjg__NN4x2 = conj( mdl_NN4x2 ); + mdl_conjg__NN4x3 = conj( mdl_NN4x3 ); + mdl_conjg__NN4x4 = conj( mdl_NN4x4 ); + mdl_conjg__UU1x1 = conj( mdl_UU1x1 ); + mdl_conjg__UU1x2 = conj( mdl_UU1x2 ); + mdl_conjg__UU2x1 = conj( mdl_UU2x1 ); + mdl_conjg__UU2x2 = conj( mdl_UU2x2 ); + mdl_conjg__VV1x1 = conj( mdl_VV1x1 ); + mdl_conjg__VV1x2 = conj( mdl_VV1x2 ); + mdl_conjg__VV2x1 = conj( mdl_VV2x1 ); + mdl_conjg__VV2x2 = conj( mdl_VV2x2 ); + mdl_cos__alp = cos( mdl_alp ); + mdl_sin__alp = sin( mdl_alp ); + mdl_conjg__MUH = conj( mdl_MUH ); + mdl_ee = 2. * sqrt( 1. / aEWM1 ) * sqrt( M_PI ); + mdl_gp = mdl_ee / mdl_cw; + mdl_gw = mdl_ee / mdl_sw; + mdl_vev = ( 2. * mdl_cw * mdl_MZ * mdl_sw ) / mdl_ee; + mdl_vd = mdl_vev * mdl_cos__beta; + mdl_vu = mdl_vev * mdl_sin__beta; + mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); +} + +void +Parameters_MSSM_SLHA2::setIndependentCouplings() +{ + // (none) +} + +/* +void +Parameters_MSSM_SLHA2::setDependentParameters() // now computed event-by-event (running alphas #373) +{ + mdl_sqrt__aS = sqrt( aS ); + G = 2. 
* mdl_sqrt__aS * sqrt( M_PI ); + mdl_G__exp__2 = ( ( G ) * ( G ) ); +} + +void +Parameters_MSSM_SLHA2::setDependentCouplings() // now computed event-by-event (running alphas #373) +{ + GC_6 = -G; + GC_51 = -( mdl_complexi * G * mdl_I51x11 ); +} +*/ + +#endif + +// Routines for printing out parameters +void +Parameters_MSSM_SLHA2::printIndependentParameters() +{ + std::cout << "MSSM_SLHA2 model parameters independent of event kinematics:" << std::endl; + std::cout << "(Warning: aS in the runcard is ignored because event-by-event Gs are hardcoded or retrieved from Fortran)" << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsd6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsd5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsd4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wch2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wch2 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_Wneu4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wneu4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wneu3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wneu3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wch1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wch1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wneu2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wneu2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wgo = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wgo << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsn3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsn3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsn2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsn2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsn1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsn1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsl1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsl1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsd3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) 
<< mdl_Wsd2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Wsd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Wsd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WH << std::endl; + std::cout << std::setw( 20 ) << "mdl_WA0 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WA0 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WH02 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WH02 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WH01 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WH01 << std::endl; + std::cout << std::setw( 20 ) << "mdl_WW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WW << std::endl; + std::cout << std::setw( 20 ) << "mdl_WZ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WZ << std::endl; + std::cout << std::setw( 20 ) << "mdl_WT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_WT << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ryu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ryu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rye3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rye3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ryd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ryd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RVV2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RVV2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RVV2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RVV2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RVV1x2 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RVV1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RVV1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RVV1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RCKM3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RCKM3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RCKM2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RCKM2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RCKM1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RCKM1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu5x5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu5x5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRu1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRu1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMNS3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMNS3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMNS2x2 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMNS2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMNS1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMNS1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RUU2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RUU2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RUU2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RUU2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RUU1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RUU1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RUU1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RUU1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rtu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rtu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rte3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rte3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rtd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rtd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRn3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRn3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRn2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRn2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRn1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRn1x1 << std::endl; + //std::cout << std::setw( 20 ) << "aS = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << aS << std::endl; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + std::cout << std::setw( 20 ) << "aEWM1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << aEWM1 << std::endl; + std::cout << std::setw( 
20 ) << "mdl_RRl6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl5x5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl5x5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRl1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRl1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN4x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN4x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN4x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN4x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN4x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN4x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN3x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN3x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN3x2 = " 
<< std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN3x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN3x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN3x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN2x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN2x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN2x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN2x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN1x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN1x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN1x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN1x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RNN1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RNN1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmU23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmU23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmU21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmU21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmQ23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmQ23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmQ21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmQ21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mHu2 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mHu2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mHd2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mHd2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMx3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMx3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMx2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMx2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMx1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMx1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmL23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmL23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmL21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmL21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmE23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmE23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmE21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmE21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmD23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmD23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmD21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmD21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msd6 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_Msd6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msd4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msd4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mch2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mch2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mneu4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mneu4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mneu3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mneu3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mch1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mch1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mneu2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mneu2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mneu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mneu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mgo = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mgo << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msn3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msn3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msn1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msn1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu3 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_Msd3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msd3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msd1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msd1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH << std::endl; + std::cout << std::setw( 20 ) << "mdl_MA0 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MA0 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH02 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH02 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MH01 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MH01 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MW = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MW << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mta = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mta << std::endl; + std::cout << std::setw( 20 ) << "mdl_MT = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MT << std::endl; + std::cout << std::setw( 20 ) << "mdl_MB = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MB << std::endl; + std::cout << std::setw( 20 ) << "mdl_MA2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MA2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_tb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_tb << std::endl; + std::cout << std::setw( 20 ) << "mdl_RMUH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RMUH << std::endl; + 
std::cout << std::setw( 20 ) << "mdl_alp = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_alp << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd5x5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd5x5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RRd1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RRd1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msd5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msd5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msd2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msd2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msu2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msu2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl5 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl5 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msl2 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msl2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Msn2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Msn2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmU22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmU22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmQ22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmQ22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmL22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmL22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmE22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmE22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_RmD22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_RmD22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rn3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rn3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__CKM3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__CKM3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ru4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ru1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rn3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rn3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rn1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rn1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl1x1 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I98x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I98x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I97x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I97x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I96x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I96x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I93x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I93x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I92x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I92x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I87x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I87x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I82x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I82x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I74x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I74x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I6x44 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I6x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I5x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I5x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I53x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I53x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I52x44 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I52x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I51x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I51x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I39x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I39x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I31x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I31x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I26x44 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I26x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I25x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I25x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I12x11 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I12x11 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I102x44 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I102x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I101x44 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I101x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I100x44 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I100x44 << std::endl; + std::cout << std::setw( 20 ) << "mdl_CKM3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_CKM3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_atan__tb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_atan__tb << std::endl; + std::cout << std::setw( 20 ) << "mdl_beta = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_beta << std::endl; + std::cout << std::setw( 20 ) << "mdl_cw = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cw << std::endl; + std::cout << std::setw( 20 ) << "mdl_mD21x1 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_mD21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mD22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mD22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mD23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mD23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mE21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mE21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mE22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mE22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mE23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mE23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mL21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mL21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mL22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mL22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mL23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mL23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mQ21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mQ21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mQ22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mQ22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mQ23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mQ23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mU21x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mU21x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mU22x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_mU22x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_mU23x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << 
mdl_mU23x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MUH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MUH << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mx1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mx1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mx2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mx2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Mx3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Mx3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN1x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN1x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN1x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN1x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN2x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN2x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN2x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN2x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN3x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN3x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN3x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN3x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN3x3 
= " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN3x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN3x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN4x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN4x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN4x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN4x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN4x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN4x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_NN4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_NN4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rd6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rd6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Rl6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Rl6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru3x3 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_Ru3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ru3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ru6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_Ru6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_Ru6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_UU1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_UU1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_UU1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_UU1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_UU2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_UU2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_UU2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_UU2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_VV1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_VV1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_VV1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_VV1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_VV2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_VV2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_VV2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_VV2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_td3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_td3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_te3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_te3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_tu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_tu3x3 << std::endl; + 
std::cout << std::setw( 20 ) << "mdl_yd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_ye3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ye3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_yu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_yu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_MZ__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_MZ__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_bb = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_bb << std::endl; + std::cout << std::setw( 20 ) << "mdl_cw__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cw__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sw = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sw << std::endl; + std::cout << std::setw( 20 ) << "mdl_cos__beta = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cos__beta << std::endl; + std::cout << std::setw( 20 ) << "mdl_sin__beta = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sin__beta << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__yu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__yu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I1x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I1x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__yd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__yd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I10x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I10x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I10x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I10x36 << std::endl; 
+ std::cout << std::setw( 20 ) << "mdl_conjg__Rd3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rd3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I100x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I100x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I100x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I100x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rd6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rd6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I100x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I100x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I100x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I100x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rl3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rl3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I101x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I101x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I101x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I101x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rl6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rl6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I101x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I101x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I101x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I101x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Ru3x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Ru3x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I102x33 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I102x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I102x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I102x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Ru6x6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Ru6x6 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I102x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I102x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I102x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I102x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I11x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I11x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I11x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I11x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rd3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rd3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I12x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I12x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I12x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I12x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rd6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rd6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I12x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I12x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I12x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I12x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I13x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I13x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I13x36 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I13x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I13x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I13x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I13x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I13x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__td3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__td3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I14x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I14x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I14x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I14x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I14x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I14x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I14x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I14x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I15x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I15x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I15x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I15x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I15x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I15x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I15x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I15x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I16x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I16x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I16x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I16x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I16x63 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I16x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I16x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I16x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I17x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I17x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I17x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I17x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I17x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I17x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I17x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I17x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I18x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I18x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I18x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I18x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I18x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I18x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I18x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I18x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I19x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I19x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I19x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I19x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I19x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I19x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I19x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I19x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I2x33 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I2x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I20x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I20x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I21x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I21x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__ye3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__ye3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I22x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I22x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I23x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I23x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I23x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I23x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rl3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rl3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I24x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I24x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Rl6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Rl6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I24x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I24x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I25x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I25x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I25x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I25x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I25x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I25x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I25x66 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I25x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I26x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I26x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I26x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I26x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I26x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I26x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I26x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I26x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I27x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I27x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I27x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I27x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I28x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I28x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I28x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I28x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I29x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I29x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I29x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I29x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I3x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I3x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I3x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I3x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I30x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I30x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I30x36 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I30x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I31x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I31x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I31x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I31x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I31x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I31x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I31x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I31x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I32x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I32x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I32x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I32x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I32x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I32x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I32x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I32x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__te3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__te3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I33x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I33x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I33x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I33x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I33x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I33x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I33x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I33x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I34x33 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I34x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I34x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I34x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I34x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I34x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I34x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I34x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I35x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I35x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I35x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I35x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I35x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I35x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I35x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I35x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I36x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I36x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I36x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I36x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I36x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I36x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I36x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I36x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I37x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I37x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I37x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I37x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I37x63 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_I37x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I37x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I37x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I38x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I38x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I38x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I38x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I38x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I38x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I38x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I38x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I39x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I39x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I39x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I39x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I4x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I4x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I4x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I4x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I40x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I40x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I40x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I40x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I41x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I41x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I41x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I41x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I42x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I42x33 
<< std::endl; + std::cout << std::setw( 20 ) << "mdl_I42x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I42x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I44x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I44x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I45x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I45x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I45x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I45x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I46x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I46x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I46x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I46x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I47x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I47x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I47x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I47x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I48x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I48x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I48x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I48x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I49x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I49x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I49x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I49x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I5x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I5x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I5x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I5x36 << std::endl; + std::cout << 
std::setw( 20 ) << "mdl_I5x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I5x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I5x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I5x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Ru3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Ru3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I50x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I50x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__Ru6x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__Ru6x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I50x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I50x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I51x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I51x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I51x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I51x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I51x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I51x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I51x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I51x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I52x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I52x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I52x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I52x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I52x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I52x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I52x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I52x66 << std::endl; + std::cout << 
std::setw( 20 ) << "mdl_I53x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I53x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I53x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I53x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I53x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I53x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I53x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I53x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__tu3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__tu3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I54x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I54x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I54x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I54x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I54x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I54x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I54x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I54x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I55x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I55x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I55x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I55x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I55x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I55x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I55x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I55x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I56x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I56x33 << std::endl; + std::cout << std::setw( 20 
) << "mdl_I56x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I56x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I56x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I56x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I56x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I56x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I57x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I57x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I57x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I57x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I57x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I57x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I57x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I57x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I58x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I58x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I58x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I58x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I58x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I58x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I58x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I58x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I59x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I59x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I59x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I59x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I59x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I59x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I59x66 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I59x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I6x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I6x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I6x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I6x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I6x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I6x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I6x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I6x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I60x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I60x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I60x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I60x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I60x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I60x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I60x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I60x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I61x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I61x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I61x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I61x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I62x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I62x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I62x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I62x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I63x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I63x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I63x36 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I63x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I64x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I64x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I64x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I64x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I65x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I65x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I65x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I65x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I66x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I66x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I66x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I66x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I66x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I66x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I66x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I66x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I67x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I67x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I67x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I67x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I67x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I67x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I67x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I67x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I68x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I68x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I68x36 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_I68x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I68x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I68x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I68x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I68x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I69x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I69x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I69x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I69x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I69x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I69x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I69x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I69x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I7x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I7x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I7x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I7x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I70x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I70x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I70x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I70x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I70x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I70x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I70x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I70x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I71x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I71x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I71x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I71x36 
<< std::endl; + std::cout << std::setw( 20 ) << "mdl_I71x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I71x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I71x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I71x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I72x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I72x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I72x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I72x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I72x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I72x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I72x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I72x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I73x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I73x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I73x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I73x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I73x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I73x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I73x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I73x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I74x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I74x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I74x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I74x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I74x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I74x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I74x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I74x66 << std::endl; + std::cout 
<< std::setw( 20 ) << "mdl_I75x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I75x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I75x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I75x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I75x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I75x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I75x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I75x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I76x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I76x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I76x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I76x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I76x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I76x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I76x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I76x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I77x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I77x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I77x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I77x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I77x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I77x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I77x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I77x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I78x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I78x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I78x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I78x36 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_I78x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I78x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I78x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I78x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I79x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I79x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I79x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I79x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I79x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I79x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I79x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I79x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I8x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I8x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I8x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I8x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I80x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I80x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I80x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I80x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I80x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I80x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I80x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I80x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I81x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I81x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I81x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I81x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I81x63 = " << 
std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I81x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I81x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I81x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I82x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I82x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I82x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I82x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I83x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I83x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I83x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I83x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I84x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I84x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I84x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I84x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I85x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I85x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I85x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I85x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I86x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I86x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I86x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I86x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I88x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I88x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I89x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I89x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I89x36 = " << std::setiosflags( 
std::ios::scientific ) << std::setw( 10 ) << mdl_I89x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I9x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I9x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I9x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I9x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I90x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I90x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I90x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I90x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I91x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I91x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I91x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I91x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I92x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I92x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I92x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I92x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I92x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I92x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I92x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I92x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I93x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I93x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I93x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I93x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I94x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I94x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I94x36 = " << std::setiosflags( std::ios::scientific ) << 
std::setw( 10 ) << mdl_I94x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I94x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I94x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I94x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I94x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I95x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I95x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I95x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I95x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I96x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I96x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I96x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I96x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I96x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I96x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I96x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I96x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I97x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I97x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I97x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I97x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I97x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I97x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I97x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I97x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I98x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I98x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I98x36 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << 
mdl_I98x36 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I98x63 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I98x63 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I98x66 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I98x66 << std::endl; + std::cout << std::setw( 20 ) << "mdl_I99x33 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_I99x33 << std::endl; + std::cout << std::setw( 20 ) << "mdl_complexi = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_complexi << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_sw__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sw__exp__2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN1x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN1x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN1x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN1x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN2x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN2x3 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_conjg__NN2x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN2x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN3x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN3x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN3x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN3x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN3x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN3x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN3x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN3x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN4x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN4x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN4x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN4x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN4x3 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN4x3 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__NN4x4 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__NN4x4 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__UU1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__UU1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__UU1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__UU1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__UU2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__UU2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__UU2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__UU2x2 << std::endl; + std::cout << std::setw( 20 ) << 
"mdl_conjg__VV1x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__VV1x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__VV1x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__VV1x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__VV2x1 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__VV2x1 << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__VV2x2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__VV2x2 << std::endl; + std::cout << std::setw( 20 ) << "mdl_cos__alp = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_cos__alp << std::endl; + std::cout << std::setw( 20 ) << "mdl_sin__alp = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sin__alp << std::endl; + std::cout << std::setw( 20 ) << "mdl_conjg__MUH = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_conjg__MUH << std::endl; + std::cout << std::setw( 20 ) << "mdl_ee = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee << std::endl; + std::cout << std::setw( 20 ) << "mdl_gp = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gp << std::endl; + std::cout << std::setw( 20 ) << "mdl_gw = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_gw << std::endl; + std::cout << std::setw( 20 ) << "mdl_vev = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vev << std::endl; + std::cout << std::setw( 20 ) << "mdl_vd = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vd << std::endl; + std::cout << std::setw( 20 ) << "mdl_vu = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vu << std::endl; + std::cout << std::setw( 20 ) << "mdl_ee__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee__exp__2 << std::endl; + if( mdl_Mneu2 < 0 ) + mdl_Wneu2 
= -abs( mdl_Wneu2 ); + if( mdl_Mneu3 < 0 ) + mdl_Wneu3 = -abs( mdl_Wneu3 ); + if( mdl_Mneu4 < 0 ) + mdl_Wneu4 = -abs( mdl_Wneu4 ); + if( mdl_Mgo < 0 ) + mdl_Wgo = -abs( mdl_Wgo ); +} + +void +Parameters_MSSM_SLHA2::printIndependentCouplings() +{ + std::cout << "MSSM_SLHA2 model couplings independent of event kinematics:" << std::endl; + // (none) +} + +/* +void +Parameters_MSSM_SLHA2::printDependentParameters() // now computed event-by-event (running alphas #373) +{ + std::cout << "MSSM_SLHA2 model parameters dependent on event kinematics:" << std::endl; + std::cout << std::setw( 20 ) << "mdl_sqrt__aS = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_sqrt__aS << std::endl; + std::cout << std::setw( 20 ) << "G = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << G << std::endl; + std::cout << std::setw( 20 ) << "mdl_G__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_G__exp__2 << std::endl; +} + +void +Parameters_MSSM_SLHA2::printDependentCouplings() // now computed event-by-event (running alphas #373) +{ + std::cout << "MSSM_SLHA2 model couplings dependent on event kinematics:" << std::endl; + std::cout << std::setw( 20 ) << "GC_6 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << GC_6 << std::endl; + std::cout << std::setw( 20 ) << "GC_51 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << GC_51 << std::endl; +} +*/ diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h new file mode 100644 index 0000000000..263590e463 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -0,0 +1,890 @@ +//========================================================================== +// This file has been automatically generated for CUDA/C++ standalone by +// MadGraph5_aMC@NLO v. 
3.5.0_lo_vect, 2023-01-26 +// By the MadGraph5_aMC@NLO Development Team +// Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch +//========================================================================== + +#ifndef Parameters_MSSM_SLHA2_H +#define Parameters_MSSM_SLHA2_H + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuVectors.h" + +//========================================================================== + +#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" + +#include "read_slha.h" + +class Parameters_MSSM_SLHA2 +{ +public: + + static Parameters_MSSM_SLHA2* getInstance(); + + // Define "zero" + double zero, ZERO; + + // Model parameters independent of aS + //double aS; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + double mdl_Wsl6, mdl_Wsl5, mdl_Wsl4, mdl_Wsu6, mdl_Wsd6, mdl_Wsu5, mdl_Wsd5, mdl_Wsu4, mdl_Wsd4, mdl_Wch2, mdl_Wneu4, mdl_Wneu3, mdl_Wch1, mdl_Wneu2, mdl_Wgo, mdl_Wsn3, mdl_Wsl3, mdl_Wsn2, mdl_Wsl2, mdl_Wsn1, mdl_Wsl1, mdl_Wsu3, mdl_Wsd3, mdl_Wsu2, mdl_Wsd2, mdl_Wsu1, mdl_Wsd1, mdl_WH, mdl_WA0, mdl_WH02, mdl_WH01, mdl_WW, mdl_WZ, mdl_WT, mdl_Ryu3x3, mdl_Rye3x3, mdl_Ryd3x3, mdl_RVV2x2, mdl_RVV2x1, mdl_RVV1x2, mdl_RVV1x1, mdl_RCKM3x3, mdl_RCKM2x2, mdl_RCKM1x1, mdl_RRu6x6, mdl_RRu6x3, mdl_RRu5x5, mdl_RRu4x4, mdl_RRu3x6, mdl_RRu3x3, mdl_RRu2x2, mdl_RRu1x1, mdl_RMNS3x3, mdl_RMNS2x2, mdl_RMNS1x1, mdl_RUU2x2, mdl_RUU2x1, mdl_RUU1x2, mdl_RUU1x1, mdl_Rtu3x3, mdl_Rte3x3, mdl_Rtd3x3, mdl_RRn3x3, mdl_RRn2x2, mdl_RRn1x1, aEWM1, mdl_RRl6x6, mdl_RRl6x3, mdl_RRl5x5, mdl_RRl4x4, mdl_RRl3x6, mdl_RRl3x3, mdl_RRl2x2, mdl_RRl1x1, mdl_RNN4x4, mdl_RNN4x3, mdl_RNN4x2, mdl_RNN4x1, mdl_RNN3x4, mdl_RNN3x3, mdl_RNN3x2, mdl_RNN3x1, mdl_RNN2x4, mdl_RNN2x3, mdl_RNN2x2, mdl_RNN2x1, mdl_RNN1x4, mdl_RNN1x3, mdl_RNN1x2, mdl_RNN1x1, 
mdl_RmU23x3, mdl_RmU21x1, mdl_RmQ23x3, mdl_RmQ21x1, mdl_mHu2, mdl_mHd2, mdl_RMx3, mdl_RMx2, mdl_RMx1, mdl_RmL23x3, mdl_RmL21x1, mdl_RmE23x3, mdl_RmE21x1, mdl_RmD23x3, mdl_RmD21x1, mdl_Msl6, mdl_Msl4, mdl_Msu6, mdl_Msd6, mdl_Msu4, mdl_Msd4, mdl_Mch2, mdl_Mneu4, mdl_Mneu3, mdl_Mch1, mdl_Mneu2, mdl_Mneu1, mdl_Mgo, mdl_Msn3, mdl_Msl3, mdl_Msn1, mdl_Msl1, mdl_Msu3, mdl_Msd3, mdl_Msu1, mdl_Msd1, mdl_MH, mdl_MA0, mdl_MH02, mdl_MH01, mdl_MW, mdl_MZ, mdl_Mta, mdl_MT, mdl_MB, mdl_MA2, mdl_tb, mdl_RMUH, mdl_alp, mdl_RRd6x6, mdl_RRd6x3, mdl_RRd5x5, mdl_RRd4x4, mdl_RRd3x6, mdl_RRd3x3, mdl_RRd2x2, mdl_RRd1x1, mdl_Msd5, mdl_Msd2, mdl_Msu5, mdl_Msu2, mdl_Msl5, mdl_Msl2, mdl_Msn2, mdl_RmU22x2, mdl_RmQ22x2, mdl_RmL22x2, mdl_RmE22x2, mdl_RmD22x2, mdl_conjg__Rn3x3, mdl_conjg__CKM3x3, mdl_Ru4x4, mdl_Ru1x1, mdl_Rn3x3, mdl_Rn1x1, mdl_Rl4x4, mdl_Rl1x1, mdl_Rd4x4, mdl_Rd1x1, mdl_I98x11, mdl_I97x11, mdl_I96x11, mdl_I93x11, mdl_I92x11, mdl_I87x11, mdl_I82x11, mdl_I74x11, mdl_I6x44, mdl_I5x11, mdl_I53x11, mdl_I52x44, mdl_I51x11, mdl_I39x11, mdl_I31x11, mdl_I26x44, mdl_I25x11, mdl_I12x11, mdl_I102x44, mdl_I101x44, mdl_I100x44, mdl_CKM3x3, mdl_atan__tb, mdl_beta, mdl_cw, mdl_MZ__exp__2, mdl_cw__exp__2, mdl_sw, mdl_cos__beta, mdl_sin__beta, mdl_sqrt__2, mdl_sw__exp__2, mdl_cos__alp, mdl_sin__alp, mdl_ee, mdl_gp, mdl_gw, mdl_vev, mdl_vd, mdl_vu, mdl_ee__exp__2; + cxsmpl mdl_mD21x1, mdl_mD22x2, mdl_mD23x3, mdl_mE21x1, mdl_mE22x2, mdl_mE23x3, mdl_mL21x1, mdl_mL22x2, mdl_mL23x3, mdl_mQ21x1, mdl_mQ22x2, mdl_mQ23x3, mdl_mU21x1, mdl_mU22x2, mdl_mU23x3, mdl_MUH, mdl_Mx1, mdl_Mx2, mdl_Mx3, mdl_NN1x1, mdl_NN1x2, mdl_NN1x3, mdl_NN1x4, mdl_NN2x1, mdl_NN2x2, mdl_NN2x3, mdl_NN2x4, mdl_NN3x1, mdl_NN3x2, mdl_NN3x3, mdl_NN3x4, mdl_NN4x1, mdl_NN4x2, mdl_NN4x3, mdl_NN4x4, mdl_Rd3x3, mdl_Rd3x6, mdl_Rd6x3, mdl_Rd6x6, mdl_Rl3x3, mdl_Rl3x6, mdl_Rl6x3, mdl_Rl6x6, mdl_Ru3x3, mdl_Ru3x6, mdl_Ru6x3, mdl_Ru6x6, mdl_UU1x1, mdl_UU1x2, mdl_UU2x1, mdl_UU2x2, mdl_VV1x1, mdl_VV1x2, mdl_VV2x1, mdl_VV2x2, mdl_td3x3, mdl_te3x3, 
mdl_tu3x3, mdl_yd3x3, mdl_ye3x3, mdl_yu3x3, mdl_bb, mdl_conjg__yu3x3, mdl_I1x33, mdl_conjg__yd3x3, mdl_I10x33, mdl_I10x36, mdl_conjg__Rd3x6, mdl_I100x33, mdl_I100x36, mdl_conjg__Rd6x6, mdl_I100x63, mdl_I100x66, mdl_conjg__Rl3x6, mdl_I101x33, mdl_I101x36, mdl_conjg__Rl6x6, mdl_I101x63, mdl_I101x66, mdl_conjg__Ru3x6, mdl_I102x33, mdl_I102x36, mdl_conjg__Ru6x6, mdl_I102x63, mdl_I102x66, mdl_I11x33, mdl_I11x36, mdl_conjg__Rd3x3, mdl_I12x33, mdl_I12x36, mdl_conjg__Rd6x3, mdl_I12x63, mdl_I12x66, mdl_I13x33, mdl_I13x36, mdl_I13x63, mdl_I13x66, mdl_conjg__td3x3, mdl_I14x33, mdl_I14x36, mdl_I14x63, mdl_I14x66, mdl_I15x33, mdl_I15x36, mdl_I15x63, mdl_I15x66, mdl_I16x33, mdl_I16x36, mdl_I16x63, mdl_I16x66, mdl_I17x33, mdl_I17x36, mdl_I17x63, mdl_I17x66, mdl_I18x33, mdl_I18x36, mdl_I18x63, mdl_I18x66, mdl_I19x33, mdl_I19x36, mdl_I19x63, mdl_I19x66, mdl_I2x33, mdl_I20x33, mdl_I21x33, mdl_conjg__ye3x3, mdl_I22x33, mdl_I23x33, mdl_I23x36, mdl_conjg__Rl3x3, mdl_I24x33, mdl_conjg__Rl6x3, mdl_I24x36, mdl_I25x33, mdl_I25x36, mdl_I25x63, mdl_I25x66, mdl_I26x33, mdl_I26x36, mdl_I26x63, mdl_I26x66, mdl_I27x33, mdl_I27x36, mdl_I28x33, mdl_I28x36, mdl_I29x33, mdl_I29x36, mdl_I3x33, mdl_I3x36, mdl_I30x33, mdl_I30x36, mdl_I31x33, mdl_I31x36, mdl_I31x63, mdl_I31x66, mdl_I32x33, mdl_I32x36, mdl_I32x63, mdl_I32x66, mdl_conjg__te3x3, mdl_I33x33, mdl_I33x36, mdl_I33x63, mdl_I33x66, mdl_I34x33, mdl_I34x36, mdl_I34x63, mdl_I34x66, mdl_I35x33, mdl_I35x36, mdl_I35x63, mdl_I35x66, mdl_I36x33, mdl_I36x36, mdl_I36x63, mdl_I36x66, mdl_I37x33, mdl_I37x36, mdl_I37x63, mdl_I37x66, mdl_I38x33, mdl_I38x36, mdl_I38x63, mdl_I38x66, mdl_I39x33, mdl_I39x36, mdl_I4x33, mdl_I4x36, mdl_I40x33, mdl_I40x36, mdl_I41x33, mdl_I41x36, mdl_I42x33, mdl_I42x36, mdl_I44x33, mdl_I45x33, mdl_I45x36, mdl_I46x33, mdl_I46x36, mdl_I47x33, mdl_I47x36, mdl_I48x33, mdl_I48x36, mdl_I49x33, mdl_I49x36, mdl_I5x33, mdl_I5x36, mdl_I5x63, mdl_I5x66, mdl_conjg__Ru3x3, mdl_I50x33, mdl_conjg__Ru6x3, mdl_I50x36, mdl_I51x33, mdl_I51x36, 
mdl_I51x63, mdl_I51x66, mdl_I52x33, mdl_I52x36, mdl_I52x63, mdl_I52x66, mdl_I53x33, mdl_I53x36, mdl_I53x63, mdl_I53x66, mdl_conjg__tu3x3, mdl_I54x33, mdl_I54x36, mdl_I54x63, mdl_I54x66, mdl_I55x33, mdl_I55x36, mdl_I55x63, mdl_I55x66, mdl_I56x33, mdl_I56x36, mdl_I56x63, mdl_I56x66, mdl_I57x33, mdl_I57x36, mdl_I57x63, mdl_I57x66, mdl_I58x33, mdl_I58x36, mdl_I58x63, mdl_I58x66, mdl_I59x33, mdl_I59x36, mdl_I59x63, mdl_I59x66, mdl_I6x33, mdl_I6x36, mdl_I6x63, mdl_I6x66, mdl_I60x33, mdl_I60x36, mdl_I60x63, mdl_I60x66, mdl_I61x33, mdl_I61x36, mdl_I62x33, mdl_I62x36, mdl_I63x33, mdl_I63x36, mdl_I64x33, mdl_I64x36, mdl_I65x33, mdl_I65x36, mdl_I66x33, mdl_I66x36, mdl_I66x63, mdl_I66x66, mdl_I67x33, mdl_I67x36, mdl_I67x63, mdl_I67x66, mdl_I68x33, mdl_I68x36, mdl_I68x63, mdl_I68x66, mdl_I69x33, mdl_I69x36, mdl_I69x63, mdl_I69x66, mdl_I7x33, mdl_I7x36, mdl_I70x33, mdl_I70x36, mdl_I70x63, mdl_I70x66, mdl_I71x33, mdl_I71x36, mdl_I71x63, mdl_I71x66, mdl_I72x33, mdl_I72x36, mdl_I72x63, mdl_I72x66, mdl_I73x33, mdl_I73x36, mdl_I73x63, mdl_I73x66, mdl_I74x33, mdl_I74x36, mdl_I74x63, mdl_I74x66, mdl_I75x33, mdl_I75x36, mdl_I75x63, mdl_I75x66, mdl_I76x33, mdl_I76x36, mdl_I76x63, mdl_I76x66, mdl_I77x33, mdl_I77x36, mdl_I77x63, mdl_I77x66, mdl_I78x33, mdl_I78x36, mdl_I78x63, mdl_I78x66, mdl_I79x33, mdl_I79x36, mdl_I79x63, mdl_I79x66, mdl_I8x33, mdl_I8x36, mdl_I80x33, mdl_I80x36, mdl_I80x63, mdl_I80x66, mdl_I81x33, mdl_I81x36, mdl_I81x63, mdl_I81x66, mdl_I82x33, mdl_I82x36, mdl_I83x33, mdl_I83x36, mdl_I84x33, mdl_I84x36, mdl_I85x33, mdl_I85x36, mdl_I86x33, mdl_I86x36, mdl_I88x33, mdl_I89x33, mdl_I89x36, mdl_I9x33, mdl_I9x36, mdl_I90x33, mdl_I90x36, mdl_I91x33, mdl_I91x36, mdl_I92x33, mdl_I92x36, mdl_I92x63, mdl_I92x66, mdl_I93x33, mdl_I93x36, mdl_I94x33, mdl_I94x36, mdl_I94x63, mdl_I94x66, mdl_I95x33, mdl_I95x36, mdl_I96x33, mdl_I96x36, mdl_I96x63, mdl_I96x66, mdl_I97x33, mdl_I97x36, mdl_I97x63, mdl_I97x66, mdl_I98x33, mdl_I98x36, mdl_I98x63, mdl_I98x66, mdl_I99x33, mdl_complexi, 
mdl_conjg__NN1x1, mdl_conjg__NN1x2, mdl_conjg__NN1x3, mdl_conjg__NN1x4, mdl_conjg__NN2x1, mdl_conjg__NN2x2, mdl_conjg__NN2x3, mdl_conjg__NN2x4, mdl_conjg__NN3x1, mdl_conjg__NN3x2, mdl_conjg__NN3x3, mdl_conjg__NN3x4, mdl_conjg__NN4x1, mdl_conjg__NN4x2, mdl_conjg__NN4x3, mdl_conjg__NN4x4, mdl_conjg__UU1x1, mdl_conjg__UU1x2, mdl_conjg__UU2x1, mdl_conjg__UU2x2, mdl_conjg__VV1x1, mdl_conjg__VV1x2, mdl_conjg__VV2x1, mdl_conjg__VV2x2, mdl_conjg__MUH; + + // Model couplings independent of aS + // (none) + + // Model parameters dependent on aS + //double mdl_sqrt__aS, G; // now computed event-by-event (running alphas #373) + //cxsmpl mdl_G__exp__2; // now computed event-by-event (running alphas #373) + + // Model couplings dependent on aS + //cxsmpl GC_6, GC_51; // now computed event-by-event (running alphas #373) + + // Set parameters that are unchanged during the run + void setIndependentParameters( SLHAReader& slha ); + + // Set couplings that are unchanged during the run + void setIndependentCouplings(); + + // Set parameters that are changed event by event + //void setDependentParameters(); // now computed event-by-event (running alphas #373) + + // Set couplings that are changed event by event + //void setDependentCouplings(); // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + +private: + + static Parameters_MSSM_SLHA2* instance; +}; + +#else + +#include +#include + +// Hardcoded constexpr physics parameters +namespace Parameters_MSSM_SLHA2 // keep the same name rather than 
HardcodedParameters_MSSM_SLHA2 for simplicity +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); + } + double constexpr constexpr_sqrt( double x ) + { + return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( x, x, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( double d ) + { + const int i = static_cast( d ); + return d < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr double constexpr_pow( double base, double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 
1 : base * constexpr_pow( base, iexp - 1 ); + } + + // Model parameters independent of aS + constexpr double zero = 0; + constexpr double ZERO = 0; + constexpr double mdl_Wsl6 = 2.699061e-01; + constexpr double mdl_Wsl5 = 2.161216e-01; + constexpr double mdl_Wsl4 = 2.161216e-01; + constexpr double mdl_Wsu6 = 7.373133e+00; + constexpr double mdl_Wsd6 = 8.015663e-01; + constexpr double mdl_Wsu5 = 1.152973e+00; + constexpr double mdl_Wsd5 = 2.858123e-01; + constexpr double mdl_Wsu4 = 1.152973e+00; + constexpr double mdl_Wsd4 = 2.858123e-01; + constexpr double mdl_Wch2 = 2.486895e+00; + constexpr double mdl_Wneu4 = 2.585851e+00; + constexpr double mdl_Wneu3 = 1.915985e+00; + constexpr double mdl_Wch1 = 1.704145e-02; + constexpr double mdl_Wneu2 = 2.077700e-02; + constexpr double mdl_Wgo = 5.506754e+00; + constexpr double mdl_Wsn3 = 1.475190e-01; + constexpr double mdl_Wsl3 = 1.483273e-01; + constexpr double mdl_Wsn2 = 1.498816e-01; + constexpr double mdl_Wsl2 = 2.136822e-01; + constexpr double mdl_Wsn1 = 1.498816e-01; + constexpr double mdl_Wsl1 = 2.136822e-01; + constexpr double mdl_Wsu3 = 2.021596e+00; + constexpr double mdl_Wsd3 = 3.736276e+00; + constexpr double mdl_Wsu2 = 5.477195e+00; + constexpr double mdl_Wsd2 = 5.312788e+00; + constexpr double mdl_Wsu1 = 5.477195e+00; + constexpr double mdl_Wsd1 = 5.312788e+00; + constexpr double mdl_WH = 5.469628e-01; + constexpr double mdl_WA0 = 6.321785e-01; + constexpr double mdl_WH02 = 5.748014e-01; + constexpr double mdl_WH01 = 1.986108e-03; + constexpr double mdl_WW = 2.002822e+00; + constexpr double mdl_WZ = 2.411433e+00; + constexpr double mdl_WT = 1.561950e+00; + constexpr double mdl_Ryu3x3 = 8.928445e-01; + constexpr double mdl_Rye3x3 = 1.008908e-01; + constexpr double mdl_Ryd3x3 = 1.388402e-01; + constexpr double mdl_RVV2x2 = 9.725578e-01; + constexpr double mdl_RVV2x1 = 2.326612e-01; + constexpr double mdl_RVV1x2 = -2.326612e-01; + constexpr double mdl_RVV1x1 = 9.725578e-01; + constexpr double mdl_RCKM3x3 = 
1.000000e+00; + constexpr double mdl_RCKM2x2 = 1.000000e+00; + constexpr double mdl_RCKM1x1 = 1.000000e+00; + constexpr double mdl_RRu6x6 = -5.536450e-01; + constexpr double mdl_RRu6x3 = 8.327528e-01; + constexpr double mdl_RRu5x5 = 1.000000e+00; + constexpr double mdl_RRu4x4 = 1.000000e+00; + constexpr double mdl_RRu3x6 = 8.327528e-01; + constexpr double mdl_RRu3x3 = 5.536450e-01; + constexpr double mdl_RRu2x2 = 1.000000e+00; + constexpr double mdl_RRu1x1 = 1.000000e+00; + constexpr double mdl_RMNS3x3 = 1.000000e+00; + constexpr double mdl_RMNS2x2 = 1.000000e+00; + constexpr double mdl_RMNS1x1 = 1.000000e+00; + constexpr double mdl_RUU2x2 = 9.168349e-01; + constexpr double mdl_RUU2x1 = 3.992666e-01; + constexpr double mdl_RUU1x2 = -3.992666e-01; + constexpr double mdl_RUU1x1 = 9.168349e-01; + constexpr double mdl_Rtu3x3 = -4.447525e+02; + constexpr double mdl_Rte3x3 = -2.540197e+01; + constexpr double mdl_Rtd3x3 = -1.106937e+02; + constexpr double mdl_RRn3x3 = 1.000000e+00; + constexpr double mdl_RRn2x2 = 1.000000e+00; + constexpr double mdl_RRn1x1 = 1.000000e+00; + //constexpr double aS = 1.180000e-01; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + constexpr double aEWM1 = 1.279340e+02; + constexpr double mdl_RRl6x6 = -2.824872e-01; + constexpr double mdl_RRl6x3 = 9.592711e-01; + constexpr double mdl_RRl5x5 = 1.000000e+00; + constexpr double mdl_RRl4x4 = 1.000000e+00; + constexpr double mdl_RRl3x6 = 9.592711e-01; + constexpr double mdl_RRl3x3 = 2.824872e-01; + constexpr double mdl_RRl2x2 = 1.000000e+00; + constexpr double mdl_RRl1x1 = 1.000000e+00; + constexpr double mdl_RNN4x4 = -6.843778e-01; + constexpr double mdl_RNN4x3 = 6.492260e-01; + constexpr double mdl_RNN4x2 = 3.107390e-01; + constexpr double mdl_RNN4x1 = -1.165071e-01; + constexpr double mdl_RNN3x4 = 7.102270e-01; + constexpr double mdl_RNN3x3 = 6.958775e-01; + constexpr double mdl_RNN3x2 = 8.770049e-02; + constexpr double mdl_RNN3x1 = -6.033880e-02; + constexpr double 
mdl_RNN2x4 = 1.561507e-01; + constexpr double mdl_RNN2x3 = -2.698467e-01; + constexpr double mdl_RNN2x2 = 9.449493e-01; + constexpr double mdl_RNN2x1 = 9.935054e-02; + constexpr double mdl_RNN1x4 = -5.311861e-02; + constexpr double mdl_RNN1x3 = 1.464340e-01; + constexpr double mdl_RNN1x2 = -5.311036e-02; + constexpr double mdl_RNN1x1 = 9.863644e-01; + constexpr double mdl_RmU23x3 = 1.791371e+05; + constexpr double mdl_RmU21x1 = 2.803821e+05; + constexpr double mdl_RmQ23x3 = 2.487654e+05; + constexpr double mdl_RmQ21x1 = 2.998367e+05; + constexpr double mdl_mHu2 = -1.288001e+05; + constexpr double mdl_mHd2 = 3.233749e+04; + constexpr double mdl_RMx3 = 5.882630e+02; + constexpr double mdl_RMx2 = 1.915042e+02; + constexpr double mdl_RMx1 = 1.013965e+02; + constexpr double mdl_RmL23x3 = 3.782868e+04; + constexpr double mdl_RmL21x1 = 3.815567e+04; + constexpr double mdl_RmE23x3 = 1.796764e+04; + constexpr double mdl_RmE21x1 = 1.863063e+04; + constexpr double mdl_RmD23x3 = 2.702620e+05; + constexpr double mdl_RmD21x1 = 2.736847e+05; + constexpr double mdl_Msl6 = 2.068678e+02; + constexpr double mdl_Msl4 = 1.441028e+02; + constexpr double mdl_Msu6 = 5.857858e+02; + constexpr double mdl_Msd6 = 5.437267e+02; + constexpr double mdl_Msu4 = 5.492593e+02; + constexpr double mdl_Msd4 = 5.452285e+02; + constexpr double mdl_Mch2 = 3.799393e+02; + constexpr double mdl_Mneu4 = 3.817294e+02; + constexpr double mdl_Mneu3 = -3.637560e+02; + constexpr double mdl_Mch1 = 1.816965e+02; + constexpr double mdl_Mneu2 = 1.810882e+02; + constexpr double mdl_Mneu1 = 9.668807e+01; + constexpr double mdl_Mgo = 6.077137e+02; + constexpr double mdl_Msn3 = 1.847085e+02; + constexpr double mdl_Msl3 = 1.344909e+02; + constexpr double mdl_Msn1 = 1.852583e+02; + constexpr double mdl_Msl1 = 2.029157e+02; + constexpr double mdl_Msu3 = 3.996685e+02; + constexpr double mdl_Msd3 = 5.130652e+02; + constexpr double mdl_Msu1 = 5.611190e+02; + constexpr double mdl_Msd1 = 5.684411e+02; + constexpr double mdl_MH = 
4.078790e+02; + constexpr double mdl_MA0 = 3.995839e+02; + constexpr double mdl_MH02 = 3.999601e+02; + constexpr double mdl_MH01 = 1.108991e+02; + constexpr double mdl_MW = 7.982901e+01; + constexpr double mdl_MZ = 9.118760e+01; + constexpr double mdl_Mta = 1.777000e+00; + constexpr double mdl_MT = 1.750000e+02; + constexpr double mdl_MB = 4.889917e+00; + constexpr double mdl_MA2 = 1.664391e+05; + constexpr double mdl_tb = 9.748624e+00; + constexpr double mdl_RMUH = 3.576810e+02; + constexpr double mdl_alp = -1.138252e-01; + constexpr double mdl_RRd6x6 = 9.387379e-01; + constexpr double mdl_RRd6x3 = -3.446319e-01; + constexpr double mdl_RRd5x5 = 1.000000e+00; + constexpr double mdl_RRd4x4 = 1.000000e+00; + constexpr double mdl_RRd3x6 = 3.446319e-01; + constexpr double mdl_RRd3x3 = 9.387379e-01; + constexpr double mdl_RRd2x2 = 1.000000e+00; + constexpr double mdl_RRd1x1 = 1.000000e+00; + constexpr double mdl_Msd5 = 1. * mdl_Msd4; + constexpr double mdl_Msd2 = 1. * mdl_Msd1; + constexpr double mdl_Msu5 = 1. * mdl_Msu4; + constexpr double mdl_Msu2 = 1. * mdl_Msu1; + constexpr double mdl_Msl5 = 1. * mdl_Msl4; + constexpr double mdl_Msl2 = 1. * mdl_Msl1; + constexpr double mdl_Msn2 = 1. * mdl_Msn1; + constexpr double mdl_RmU22x2 = 1. * mdl_RmU21x1; + constexpr double mdl_RmQ22x2 = 1. * mdl_RmQ21x1; + constexpr double mdl_RmL22x2 = 1. * mdl_RmL21x1; + constexpr double mdl_RmE22x2 = 1. * mdl_RmE21x1; + constexpr double mdl_RmD22x2 = 1. 
* mdl_RmD21x1; + constexpr double mdl_conjg__Rn3x3 = 1.; + constexpr double mdl_conjg__CKM3x3 = 1.; + constexpr double mdl_Ru4x4 = 1.; + constexpr double mdl_Ru1x1 = 1.; + constexpr double mdl_Rn3x3 = 1.; + constexpr double mdl_Rn1x1 = 1.; + constexpr double mdl_Rl4x4 = 1.; + constexpr double mdl_Rl1x1 = 1.; + constexpr double mdl_Rd4x4 = 1.; + constexpr double mdl_Rd1x1 = 1.; + constexpr double mdl_I98x11 = 1.; + constexpr double mdl_I97x11 = 1.; + constexpr double mdl_I96x11 = 1.; + constexpr double mdl_I93x11 = 1.; + constexpr double mdl_I92x11 = 1.; + constexpr double mdl_I87x11 = 1.; + constexpr double mdl_I82x11 = 1.; + constexpr double mdl_I74x11 = 1.; + constexpr double mdl_I6x44 = 1.; + constexpr double mdl_I5x11 = 1.; + constexpr double mdl_I53x11 = 1.; + constexpr double mdl_I52x44 = 1.; + constexpr double mdl_I51x11 = 1.; + constexpr double mdl_I39x11 = 1.; + constexpr double mdl_I31x11 = 1.; + constexpr double mdl_I26x44 = 1.; + constexpr double mdl_I25x11 = 1.; + constexpr double mdl_I12x11 = 1.; + constexpr double mdl_I102x44 = 1.; + constexpr double mdl_I101x44 = 1.; + constexpr double mdl_I100x44 = 1.; + constexpr double mdl_CKM3x3 = 1.; + constexpr double mdl_atan__tb = atan( mdl_tb ); + constexpr double mdl_beta = mdl_atan__tb; + constexpr double mdl_cw = mdl_MW / mdl_MZ; + constexpr cxsmpl mdl_mD21x1 = mdl_RmD21x1; + constexpr cxsmpl mdl_mD22x2 = mdl_RmD22x2; + constexpr cxsmpl mdl_mD23x3 = mdl_RmD23x3; + constexpr cxsmpl mdl_mE21x1 = mdl_RmE21x1; + constexpr cxsmpl mdl_mE22x2 = mdl_RmE22x2; + constexpr cxsmpl mdl_mE23x3 = mdl_RmE23x3; + constexpr cxsmpl mdl_mL21x1 = mdl_RmL21x1; + constexpr cxsmpl mdl_mL22x2 = mdl_RmL22x2; + constexpr cxsmpl mdl_mL23x3 = mdl_RmL23x3; + constexpr cxsmpl mdl_mQ21x1 = mdl_RmQ21x1; + constexpr cxsmpl mdl_mQ22x2 = mdl_RmQ22x2; + constexpr cxsmpl mdl_mQ23x3 = mdl_RmQ23x3; + constexpr cxsmpl mdl_mU21x1 = mdl_RmU21x1; + constexpr cxsmpl mdl_mU22x2 = mdl_RmU22x2; + constexpr cxsmpl mdl_mU23x3 = mdl_RmU23x3; + constexpr 
cxsmpl mdl_MUH = mdl_RMUH; + constexpr cxsmpl mdl_Mx1 = mdl_RMx1; + constexpr cxsmpl mdl_Mx2 = mdl_RMx2; + constexpr cxsmpl mdl_Mx3 = mdl_RMx3; + constexpr cxsmpl mdl_NN1x1 = mdl_RNN1x1; + constexpr cxsmpl mdl_NN1x2 = mdl_RNN1x2; + constexpr cxsmpl mdl_NN1x3 = mdl_RNN1x3; + constexpr cxsmpl mdl_NN1x4 = mdl_RNN1x4; + constexpr cxsmpl mdl_NN2x1 = mdl_RNN2x1; + constexpr cxsmpl mdl_NN2x2 = mdl_RNN2x2; + constexpr cxsmpl mdl_NN2x3 = mdl_RNN2x3; + constexpr cxsmpl mdl_NN2x4 = mdl_RNN2x4; + constexpr cxsmpl mdl_NN3x1 = mdl_RNN3x1; + constexpr cxsmpl mdl_NN3x2 = mdl_RNN3x2; + constexpr cxsmpl mdl_NN3x3 = mdl_RNN3x3; + constexpr cxsmpl mdl_NN3x4 = mdl_RNN3x4; + constexpr cxsmpl mdl_NN4x1 = mdl_RNN4x1; + constexpr cxsmpl mdl_NN4x2 = mdl_RNN4x2; + constexpr cxsmpl mdl_NN4x3 = mdl_RNN4x3; + constexpr cxsmpl mdl_NN4x4 = mdl_RNN4x4; + constexpr cxsmpl mdl_Rd3x3 = mdl_RRd3x3; + constexpr cxsmpl mdl_Rd3x6 = mdl_RRd3x6; + constexpr cxsmpl mdl_Rd6x3 = mdl_RRd6x3; + constexpr cxsmpl mdl_Rd6x6 = mdl_RRd6x6; + constexpr cxsmpl mdl_Rl3x3 = mdl_RRl3x3; + constexpr cxsmpl mdl_Rl3x6 = mdl_RRl3x6; + constexpr cxsmpl mdl_Rl6x3 = mdl_RRl6x3; + constexpr cxsmpl mdl_Rl6x6 = mdl_RRl6x6; + constexpr cxsmpl mdl_Ru3x3 = mdl_RRu3x3; + constexpr cxsmpl mdl_Ru3x6 = mdl_RRu3x6; + constexpr cxsmpl mdl_Ru6x3 = mdl_RRu6x3; + constexpr cxsmpl mdl_Ru6x6 = mdl_RRu6x6; + constexpr cxsmpl mdl_UU1x1 = mdl_RUU1x1; + constexpr cxsmpl mdl_UU1x2 = mdl_RUU1x2; + constexpr cxsmpl mdl_UU2x1 = mdl_RUU2x1; + constexpr cxsmpl mdl_UU2x2 = mdl_RUU2x2; + constexpr cxsmpl mdl_VV1x1 = mdl_RVV1x1; + constexpr cxsmpl mdl_VV1x2 = mdl_RVV1x2; + constexpr cxsmpl mdl_VV2x1 = mdl_RVV2x1; + constexpr cxsmpl mdl_VV2x2 = mdl_RVV2x2; + constexpr cxsmpl mdl_td3x3 = mdl_Rtd3x3; + constexpr cxsmpl mdl_te3x3 = mdl_Rte3x3; + constexpr cxsmpl mdl_tu3x3 = mdl_Rtu3x3; + constexpr cxsmpl mdl_yd3x3 = mdl_Ryd3x3; + constexpr cxsmpl mdl_ye3x3 = mdl_Rye3x3; + constexpr cxsmpl mdl_yu3x3 = mdl_Ryu3x3; + constexpr double mdl_MZ__exp__2 = ( ( mdl_MZ ) 
* ( mdl_MZ ) ); + constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * cos( 2. * mdl_beta ) ) * tan( 2. * mdl_beta ) ) / 2.; + constexpr double mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); + constexpr double mdl_sw = constexpr_sqrt( 1. - mdl_cw__exp__2 ); + constexpr double mdl_cos__beta = cos( mdl_beta ); + constexpr double mdl_sin__beta = sin( mdl_beta ); + constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); + constexpr cxsmpl mdl_I1x33 = mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_conjg__yd3x3 = conj( mdl_yd3x3 ); + constexpr cxsmpl mdl_I10x33 = mdl_Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I10x36 = mdl_Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_conjg__Rd3x6 = conj( mdl_Rd3x6 ); + constexpr cxsmpl mdl_I100x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I100x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_conjg__Rd6x6 = conj( mdl_Rd6x6 ); + constexpr cxsmpl mdl_I100x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I100x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_conjg__Rl3x6 = conj( mdl_Rl3x6 ); + constexpr cxsmpl mdl_I101x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I101x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_conjg__Rl6x6 = conj( mdl_Rl6x6 ); + constexpr cxsmpl mdl_I101x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I101x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_conjg__Ru3x6 = conj( mdl_Ru3x6 ); + constexpr cxsmpl mdl_I102x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I102x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_conjg__Ru6x6 = conj( mdl_Ru6x6 ); + constexpr cxsmpl mdl_I102x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I102x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I11x33 = mdl_Rd3x6 * mdl_yd3x3; + constexpr cxsmpl mdl_I11x36 = mdl_Rd6x6 * mdl_yd3x3; + constexpr cxsmpl mdl_conjg__Rd3x3 = conj( mdl_Rd3x3 ); + constexpr cxsmpl mdl_I12x33 = mdl_Rd3x3 * 
mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I12x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_conjg__Rd6x3 = conj( mdl_Rd6x3 ); + constexpr cxsmpl mdl_I12x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I12x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I13x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I13x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I13x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I13x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_conjg__td3x3 = conj( mdl_td3x3 ); + constexpr cxsmpl mdl_I14x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I15x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I16x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I16x36 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I16x63 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I16x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I17x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x36 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x63 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I18x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I18x36 = mdl_Rd6x6 * 
mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I18x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I18x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I19x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I2x33 = mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I20x33 = mdl_CKM3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I21x33 = mdl_CKM3x3 * mdl_yu3x3; + constexpr cxsmpl mdl_conjg__ye3x3 = conj( mdl_ye3x3 ); + constexpr cxsmpl mdl_I22x33 = mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I23x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I23x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_conjg__Rl3x3 = conj( mdl_Rl3x3 ); + constexpr cxsmpl mdl_I24x33 = mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_conjg__Rl6x3 = conj( mdl_Rl6x3 ); + constexpr cxsmpl mdl_I24x36 = mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I25x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I25x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I25x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I25x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I26x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I26x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I26x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I26x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I27x33 = mdl_Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I27x36 = mdl_Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I28x33 = mdl_Rl3x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I28x36 = mdl_Rl6x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I29x33 = mdl_Rl3x3; + constexpr 
cxsmpl mdl_I29x36 = mdl_Rl6x3; + constexpr cxsmpl mdl_I3x33 = mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I3x36 = mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I30x33 = mdl_Rl3x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I30x36 = mdl_Rl6x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I31x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I31x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I31x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I31x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I32x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I32x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I32x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I32x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_conjg__te3x3 = conj( mdl_te3x3 ); + constexpr cxsmpl mdl_I33x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I34x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I35x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I35x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I35x63 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I35x66 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I36x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x63 = 
mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x66 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I37x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I37x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I37x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I37x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I38x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I39x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I39x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I4x33 = mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I4x36 = mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I40x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I40x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I41x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I41x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I42x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I42x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I44x33 = mdl_Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I45x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I45x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I46x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I46x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I47x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl 
mdl_I47x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I48x33 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I48x36 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I49x33 = mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I49x36 = mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I5x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I5x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I5x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I5x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_conjg__Ru3x3 = conj( mdl_Ru3x3 ); + constexpr cxsmpl mdl_I50x33 = mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_conjg__Ru6x3 = conj( mdl_Ru6x3 ); + constexpr cxsmpl mdl_I50x36 = mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I51x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I51x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I51x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I51x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I52x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I52x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I52x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I52x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I53x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I53x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I53x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I53x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_conjg__tu3x3 = conj( mdl_tu3x3 ); + constexpr cxsmpl mdl_I54x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x63 = mdl_Rd6x3 * 
mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I55x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I56x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I56x36 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I56x63 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I56x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I57x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I57x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I57x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I57x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I58x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x36 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x63 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I59x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I59x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + 
constexpr cxsmpl mdl_I59x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I59x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I6x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I6x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I6x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I6x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I60x33 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x36 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x63 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x66 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I61x33 = mdl_Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I61x36 = mdl_Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I62x33 = mdl_Ru3x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I62x36 = mdl_Ru6x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I63x33 = mdl_CKM3x3 * mdl_Ru3x3; + constexpr cxsmpl mdl_I63x36 = mdl_CKM3x3 * mdl_Ru6x3; + constexpr cxsmpl mdl_I64x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I64x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I65x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I65x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I66x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I66x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I66x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I66x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I67x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x36 = mdl_CKM3x3 * 
mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I68x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I69x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I69x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I69x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I69x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I7x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I7x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I70x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I71x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I71x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I71x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I71x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I72x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 
* mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I72x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I72x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I72x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I73x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I74x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I74x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I74x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I74x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I75x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I75x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I75x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I75x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I76x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I77x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I78x33 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I78x36 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + 
constexpr cxsmpl mdl_I78x63 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I78x66 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I79x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I79x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I79x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I79x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I8x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I8x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x33 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x36 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x63 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x66 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I82x33 = mdl_CKM3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I82x36 = mdl_CKM3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I83x33 = mdl_CKM3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I83x36 = mdl_CKM3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I84x33 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I84x36 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I85x33 = mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I85x36 = mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I86x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I86x36 = 
mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I88x33 = mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I89x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I89x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I9x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I9x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I90x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I90x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I91x33 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I91x36 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I92x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I92x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I92x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I92x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I93x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I93x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I94x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I94x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I94x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I94x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I95x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I95x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I96x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I96x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I96x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I96x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I97x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I97x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl 
mdl_I97x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I97x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I98x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I98x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I98x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I98x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I99x33 = mdl_ye3x3; + constexpr cxsmpl mdl_complexi = cxsmpl( 0., 1. ); + constexpr double mdl_sqrt__2 = constexpr_sqrt( 2. ); + constexpr double mdl_sw__exp__2 = ( ( mdl_sw ) * ( mdl_sw ) ); + constexpr cxsmpl mdl_conjg__NN1x1 = conj( mdl_NN1x1 ); + constexpr cxsmpl mdl_conjg__NN1x2 = conj( mdl_NN1x2 ); + constexpr cxsmpl mdl_conjg__NN1x3 = conj( mdl_NN1x3 ); + constexpr cxsmpl mdl_conjg__NN1x4 = conj( mdl_NN1x4 ); + constexpr cxsmpl mdl_conjg__NN2x1 = conj( mdl_NN2x1 ); + constexpr cxsmpl mdl_conjg__NN2x2 = conj( mdl_NN2x2 ); + constexpr cxsmpl mdl_conjg__NN2x3 = conj( mdl_NN2x3 ); + constexpr cxsmpl mdl_conjg__NN2x4 = conj( mdl_NN2x4 ); + constexpr cxsmpl mdl_conjg__NN3x1 = conj( mdl_NN3x1 ); + constexpr cxsmpl mdl_conjg__NN3x2 = conj( mdl_NN3x2 ); + constexpr cxsmpl mdl_conjg__NN3x3 = conj( mdl_NN3x3 ); + constexpr cxsmpl mdl_conjg__NN3x4 = conj( mdl_NN3x4 ); + constexpr cxsmpl mdl_conjg__NN4x1 = conj( mdl_NN4x1 ); + constexpr cxsmpl mdl_conjg__NN4x2 = conj( mdl_NN4x2 ); + constexpr cxsmpl mdl_conjg__NN4x3 = conj( mdl_NN4x3 ); + constexpr cxsmpl mdl_conjg__NN4x4 = conj( mdl_NN4x4 ); + constexpr cxsmpl mdl_conjg__UU1x1 = conj( mdl_UU1x1 ); + constexpr cxsmpl mdl_conjg__UU1x2 = conj( mdl_UU1x2 ); + constexpr cxsmpl mdl_conjg__UU2x1 = conj( mdl_UU2x1 ); + constexpr cxsmpl mdl_conjg__UU2x2 = conj( mdl_UU2x2 ); + constexpr cxsmpl mdl_conjg__VV1x1 = conj( mdl_VV1x1 ); + constexpr cxsmpl mdl_conjg__VV1x2 = conj( mdl_VV1x2 ); + constexpr cxsmpl mdl_conjg__VV2x1 = conj( mdl_VV2x1 ); + constexpr cxsmpl mdl_conjg__VV2x2 = conj( mdl_VV2x2 ); + constexpr double mdl_cos__alp = cos( mdl_alp ); + constexpr 
double mdl_sin__alp = sin( mdl_alp ); + constexpr cxsmpl mdl_conjg__MUH = conj( mdl_MUH ); + constexpr double mdl_ee = 2. * constexpr_sqrt( 1. / aEWM1 ) * constexpr_sqrt( M_PI ); + constexpr double mdl_gp = mdl_ee / mdl_cw; + constexpr double mdl_gw = mdl_ee / mdl_sw; + constexpr double mdl_vev = ( 2. * mdl_cw * mdl_MZ * mdl_sw ) / mdl_ee; + constexpr double mdl_vd = mdl_vev * mdl_cos__beta; + constexpr double mdl_vu = mdl_vev * mdl_sin__beta; + constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + + if( mdl_Mneu2 < 0 ) + mdl_Wneu2 = -abs( mdl_Wneu2 ); + if( mdl_Mneu3 < 0 ) + mdl_Wneu3 = -abs( mdl_Wneu3 ); + if( mdl_Mneu4 < 0 ) + mdl_Wneu4 = -abs( mdl_Wneu4 ); + if( mdl_Mgo < 0 ) + mdl_Wgo = -abs( mdl_Wgo ); + // Model couplings independent of aS + // (none) + + // Model parameters dependent on aS + //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) + + // Model couplings dependent on aS + //constexpr cxsmpl GC_6 = -G; // now computed event-by-event (running alphas #373) + //constexpr cxsmpl GC_51 = -( mdl_complexi * G * mdl_I51x11 ); // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) +} + +#endif + +//========================================================================== + +namespace 
Parameters_MSSM_SLHA2_dependentCouplings +{ + constexpr size_t ndcoup = 2; // #couplings that vary event by event because they depend on the running alphas QCD + constexpr size_t idcoup_GC_6 = 0; + constexpr size_t idcoup_GC_51 = 1; + struct DependentCouplings_sv + { + cxtype_sv GC_6; + cxtype_sv GC_51; + }; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#ifdef __CUDACC__ +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 // e.g. <> +#endif + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + { +#ifdef MGONGPU_HARDCODE_PARAM + using namespace Parameters_MSSM_SLHA2; +#endif + // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: + // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below + const cxtype cI( 0., 1. ); + DependentCouplings_sv out; + // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) +#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) + { + const fptype_sv& G = G_sv; + // Model parameters dependent on aS + //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); + constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + out.GC_6 = -G; + out.GC_51 = -( cI * G * mdl_I51x11 ); + } +#else + // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) + // Use an explicit loop to avoid <> + // Problems may come e.g. 
in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) + fptype_v GC_6r_v; + fptype_v GC_6i_v; + fptype_v GC_51r_v; + fptype_v GC_51i_v; + for( int i = 0; i < neppV; i++ ) + { + const fptype& G = G_sv[i]; + // Model parameters dependent on aS + //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); + constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + const cxtype GC_6 = -G; + const cxtype GC_51 = -( cI * G * mdl_I51x11 ); + GC_6r_v[i] = cxreal( GC_6 ); + GC_6i_v[i] = cximag( GC_6 ); + GC_51r_v[i] = cxreal( GC_51 ); + GC_51i_v[i] = cximag( GC_51 ); + } + out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); + out.GC_51 = cxtype_v( GC_51r_v, GC_51i_v ); +#endif + // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) + return out; + } +#ifdef __CUDACC__ +#pragma GCC diagnostic pop +#pragma nv_diagnostic pop +#endif +} + +//========================================================================== + +namespace Parameters_MSSM_SLHA2_independentCouplings +{ + constexpr size_t nicoup = 0; // #couplings that are fixed for all events because they do not depend on the running alphas QCD + // NB: there are no aS-independent couplings in this physics process +} + +//========================================================================== + +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#endif + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_MSSM_SLHA2_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); + cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s ); + GC_6s_sv = couplings_sv.GC_6; + GC_51s_sv = couplings_sv.GC_51; + mgDebug( 1, __FUNCTION__ ); + return; + } +#pragma GCC diagnostic pop +} + +//========================================================================== + +#endif // Parameters_MSSM_SLHA2_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk new file mode 100644 index 0000000000..4dbc05afe1 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk @@ -0,0 +1,268 @@ +#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) +#=== NB: assume that the same name (e.g. cudacpp.mk, Makefile...) is used in the Subprocess and src directories + +THISMK = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) + +#------------------------------------------------------------------------------- + +#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) + +SHELL := /bin/bash + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for CUDA and C++ + +INCFLAGS = -I. 
+OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here + +#------------------------------------------------------------------------------- + +#=== Configure the C++ compiler + +CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) $(USE_NVTX) -fPIC -Wall -Wshadow -Wextra +ifeq ($(shell $(CXX) --version | grep ^nvc++),) +CXXFLAGS+= -ffast-math # see issue #117 +endif +###CXXFLAGS+= -Ofast # performance is not different from --fast-math +###CXXFLAGS+= -g # FOR DEBUGGING ONLY + +# Note: AR, CXX and FC are implicitly defined if not set externally +# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html +###RANLIB = ranlib + +#------------------------------------------------------------------------------- + +#=== Configure ccache for CUDA and C++ builds + +# Enable ccache if USECCACHE=1 +ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) + override CXX:=ccache $(CXX) +endif +#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) +# override AR:=ccache $(AR) +#endif +#ifneq ($(NVCC),) +# ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) +# override NVCC:=ccache $(NVCC) +# endif +#endif + +#------------------------------------------------------------------------------- + +#=== Configure PowerPC-specific compiler flags for CUDA and C++ + +# Assuming uname is available, detect if architecture is PowerPC +UNAME_P := $(shell uname -p) + +# PowerPC-specific CXX compiler flags (being reviewed) +ifeq ($(UNAME_P),ppc64le) + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 + ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change + ###CXXFLAGS+= -fpeel-loops # no change + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -ftree-vectorize # no change + ###CXXFLAGS+= -flto # BUILD ERROR IF THIS ADDED IN SRC?! 
+else + ###AR=gcc-ar # needed by -flto + ###RANLIB=gcc-ranlib # needed by -flto + ###CXXFLAGS+= -flto # NB: build error from src/Makefile unless gcc-ar and gcc-ranlib are used + ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) +endif + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN + +# Set the build flags appropriate to OMPFLAGS +###$(info OMPFLAGS=$(OMPFLAGS)) +CXXFLAGS += $(OMPFLAGS) + +# Set the build flags appropriate to each AVX choice (example: "make AVX=none") +# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] +# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] +$(info AVX=$(AVX)) +ifeq ($(UNAME_P),ppc64le) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) + endif +else ifeq ($(UNAME_P),arm) + ifeq ($(AVX),sse4) + override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) + endif +else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 + ifeq ($(AVX),none) + override AVXFLAGS = -mno-sse3 # no SIMD + else ifeq ($(AVX),sse4) + override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else + 
$(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +else + ifeq ($(AVX),sse4) + override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) + else ifeq ($(AVX),avx2) + override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] + else ifeq ($(AVX),512y) + override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] + else ifeq ($(AVX),512z) + override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) + else ifneq ($(AVX),none) + $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) + endif +endif +# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? +CXXFLAGS+= $(AVXFLAGS) + +# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") +###$(info FPTYPE=$(FPTYPE)) +ifeq ($(FPTYPE),d) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE +else ifeq ($(FPTYPE),f) + CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT +else ifeq ($(FPTYPE),m) + CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT +else + $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) +endif + +# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") +###$(info HELINL=$(HELINL)) +ifeq ($(HELINL),1) + CXXFLAGS += -DMGONGPU_INLINE_HELAMPS +else ifneq ($(HELINL),0) + $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") +###$(info HRDCOD=$(HRDCOD)) +ifeq ($(HRDCOD),1) + CXXFLAGS += -DMGONGPU_HARDCODE_PARAM +else ifneq ($(HRDCOD),0) + $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) +endif + +# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") +###$(info 
RNDGEN=$(RNDGEN)) +ifeq ($(RNDGEN),hasNoCurand) + CXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifneq ($(RNDGEN),hasCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +#------------------------------------------------------------------------------- + +#=== Configure build directories and build lockfiles === + +# Build directory "short" tag (defines target and path to the optional build directory) +# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) +override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) + +# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) +# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) + +# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 +###$(info Current directory is $(shell pwd)) +ifeq ($(USEBUILDDIR),1) + override BUILDDIR = build.$(DIRTAG) + override LIBDIRREL = ../lib/$(BUILDDIR) + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR=1 is set)) +else + override BUILDDIR = . 
+ override LIBDIRREL = ../lib + ###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) +endif +######$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) + +# Workaround for Mac #375 (I did not manage to fix rpath with @executable_path): use absolute paths for LIBDIR +# (NB: this is quite ugly because it creates the directory if it does not exist - to avoid removing src by mistake) +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) +override LIBDIR = $(shell mkdir -p $(LIBDIRREL); cd $(LIBDIRREL); pwd) +ifeq ($(wildcard $(LIBDIR)),) +$(error Directory LIBDIR="$(LIBDIR)" should have been created by now) +endif +else +override LIBDIR = $(LIBDIRREL) +endif + +#=============================================================================== +#=== Makefile TARGETS and build rules below +#=============================================================================== + +# NB1: there are no CUDA targets in src as we avoid RDC! +# NB2: CUDA includes for curand.h are no longer needed in the C++ code anywhere in src! + +MG5AMC_COMMONLIB = mg5amc_common + +# First target (default goal) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + +# Target (and build options): debug +debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: all.$(TAG) + +# Target: tag-specific build lockfiles +override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` +override oldtagsl=`if [ -d $(LIBDIR) ]; then find $(LIBDIR) -maxdepth 1 -name '.build.*' ! 
-name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` + +$(BUILDDIR)/.build.$(TAG): $(LIBDIR)/.build.$(TAG) + +$(LIBDIR)/.build.$(TAG): + @if [ "$(oldtagsl)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(LIBDIR) for other tags:\n$(oldtagsl)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @if [ "$(oldtagsb)" != "" ]; then echo -e "Cannot build for tag=$(TAG) as old builds exist in $(BUILDDIR) for other tags:\n$(oldtagsb)\nPlease run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + @touch $(LIBDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + @touch $(BUILDDIR)/.build.$(TAG) + +#------------------------------------------------------------------------------- + +# Generic target and build rules: objects from C++ compilation +$(BUILDDIR)/%.o : %.cc *.h + @if [ ! -d $(BUILDDIR) ]; then mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + +#------------------------------------------------------------------------------- + +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o read_slha.o) + +# Target (and build rules): common (src) library +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) + @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(CXX) -shared -o$@ $(cxx_objects) + +#------------------------------------------------------------------------------- + +# Target: clean the builds +.PHONY: clean + +clean: +ifeq ($(USEBUILDDIR),1) + rm -rf $(LIBDIR) + rm -rf $(BUILDDIR) +else + rm -f $(LIBDIR)/.build.* $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so + rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe +endif + +cleanall: + @echo + $(MAKE) clean -f $(THISMK) + @echo + rm -rf $(LIBDIR)/build.* + rm -rf build.* + +#------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h new file mode 100644 index 0000000000..6f939b6d4f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -0,0 +1,234 @@ +#ifndef MGONGPUCONFIG_H +#define MGONGPUCONFIG_H 1 + +// HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) +// There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) +#undef MGONGPU_SUPPORTS_MULTICHANNEL + +// ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" +// ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) + +// Choose if curand is supported for generating random numbers +// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +#ifdef __CUDACC__ +#undef MGONGPU_HAS_NO_CURAND +#else +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +#endif + +// Choose floating point precision (for everything but color algebra #537) +// If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE_DOUBLE and not defined MGONGPU_FPTYPE_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE_DOUBLE 1 // default +//#define MGONGPU_FPTYPE_FLOAT 1 // 2x faster +#endif + +// Choose floating point precision (for color algebra alone #537) +// If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE2_FLOAT, nothing happens (issue #167) +#if not defined MGONGPU_FPTYPE2_DOUBLE and not defined MGONGPU_FPTYPE2_FLOAT +// Floating point precision (CHOOSE ONLY ONE) +#define MGONGPU_FPTYPE2_DOUBLE 1 // default +//#define MGONGPU_FPTYPE2_FLOAT 1 // 2x faster +#endif + +// Choose whether to inline all HelAmps functions +// This optimization can gain almost a factor 4 in C++, similar to -flto (issue #229) +// By default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_INLINE_HELAMPS +//#undef MGONGPU_INLINE_HELAMPS // default +////#define MGONGPU_INLINE_HELAMPS 1 + +// Choose whether to hardcode the cIPD physics parameters rather than reading them from user cards +// This optimization can gain 20% in CUDA in eemumu (issue #39) +// By default, do not hardcode, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HARDCODE_PARAM + // ** NB: The option to use hardcoded cIPD physics parameters is supported again even now when alphas is running (#373) + // ** NB: Note however that it now only refers to cIPD parameters (cIPC parameters are always accessed through global memory) +//#undef MGONGPU_HARDCODE_PARAM // default +////#define MGONGPU_HARDCODE_PARAM 1 + +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ +#define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) +//#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) +//#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#endif + +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default +//#define MGONGPU_NSIGHT_DEBUG 1 +#endif + +// SANITY CHECKS (floating point precision for everything but color algebra #537) +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE_DOUBLE or MGONGPU_FPTYPE_FLOAT +#endif + +// SANITY CHECKS (floating point precision for color algebra alone #537) +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_FPTYPE2_DOUBLE or MGONGPU_FPTYPE2_FLOAT +#endif +#if defined MGONGPU_FPTYPE2_DOUBLE and defined MGONGPU_FPTYPE_FLOAT +#error You cannot use double precision for color algebra and single precision elsewhere +#endif + +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined 
MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +#endif +#endif + +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +#endif +#endif + +namespace mgOnGpu +{ + + // --- Type definitions + + // Floating point type (for everything but color algebra #537): fptype +#if defined MGONGPU_FPTYPE_DOUBLE + typedef double fptype; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE_FLOAT + typedef float fptype; // single precision (4 bytes, fp32) +#endif + + // Floating point type (for color algebra alone #537): fptype2 +#if defined MGONGPU_FPTYPE2_DOUBLE + typedef double fptype2; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE2_FLOAT + typedef float fptype2; // single precision (4 bytes, fp32) +#endif + + // --- Physics process-specific constants that are best declared at compile time + + const int np4 = 4; // dimensions of 4-momenta (E,px,py,pz) + + const int npari = 2; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + + const int nparf = 2; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + + const int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + + const int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + + const int nw6 = 6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + + const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + + // --- Platform-specific software implementation details + + // Maximum number of blocks per grid + // ** NB Some arrays of pointers will be allocated statically to fit all these blocks + // ** (the actual memory for each block will then be allocated dynamically only for existing blocks) + //const int nbpgMAX = 2048; + + // Maximum number of threads per block + //const int ntpbMAX = 256; // AV Apr2021: why had I set this to 256? + const int ntpbMAX = 1024; // NB: 512 is ok, but 1024 does fail with "too many resources requested for launch" + + // Alignment requirement for using reinterpret_cast with SIMD vectorized code + // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) + // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) +#ifndef __CUDACC__ + constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) +#endif + +} + +// Expose typedefs and operators outside the namespace +using mgOnGpu::fptype; +using mgOnGpu::fptype2; + +// C++ SIMD vectorization width (this will be used to set neppV) +#ifdef __CUDACC__ // CUDA implementation has no SIMD +#undef MGONGPU_CPPSIMD +#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 8 +#else +#define MGONGPU_CPPSIMD 16 +#endif +#elif defined __AVX512VL__ // C++ "512y" AVX512 with 256 width (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [gcc DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __AVX2__ // C++ "avx2" AVX2 (256-bit ie 32-byte): 4 (DOUBLE) or 8 (FLOAT) [clang DEFAULT] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 4 +#else +#define MGONGPU_CPPSIMD 8 +#endif +#elif defined __SSE4_2__ // C++ "sse4" SSE4.2 (128-bit ie 16-byte): 2 (DOUBLE) or 4 (FLOAT) [Power9 and ARM default] +#ifdef MGONGPU_FPTYPE_DOUBLE +#define MGONGPU_CPPSIMD 2 +#else +#define MGONGPU_CPPSIMD 4 +#endif +#else // C++ "none" i.e. 
no SIMD +#undef MGONGPU_CPPSIMD +#endif + +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; +#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } +#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } +#define mgDebugFinalise() { if ( blockIdx.x == 0 && threadIdx.x == 0 ) printf( "MGDEBUG: counter=%f\n", mgDebugCounter[threadIdx.x] ); } +#else +#define mgDebugDeclare() /*noop*/ +#define mgDebugInitialise() { /*noop*/ } +#define mgDebug( code, text ) { /*noop*/ } +#define mgDebugFinalise() { /*noop*/ } +#endif /* clang-format on */ + +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ +#define __global__ +#define __host__ +#define __device__ +#endif + +// For SANITY CHECKS: check that neppR, neppM, neppV... are powers of two (https://stackoverflow.com/a/108360) +inline constexpr bool +ispoweroftwo( int n ) +{ + return ( n > 0 ) && !( n & ( n - 1 ) ); +} + +// Compiler version support (#96): require nvcc from CUDA >= 11.2, e.g. to use C++17 (see #333) +#ifdef __NVCC__ +#if( __CUDACC_VER_MAJOR__ < 11 ) || ( __CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 2 ) +#error Unsupported CUDA version: please use CUDA >= 11.2 +#endif +#endif + +// Compiler version support (#96): require clang >= 11 +#if defined __clang__ +#if( __clang_major__ < 11 ) +#error Unsupported clang version: please use clang >= 11 +#endif +// Compiler version support (#96): require gcc >= 9.3, e.g. for some OMP issues (see #269) +// [NB skip this check for the gcc toolchain below clang or icx (TEMPORARY? 
+#elif defined __GNUC__ +#if( __GNUC__ < 9 ) || ( __GNUC__ == 9 && __GNUC_MINOR__ < 3 ) +#error Unsupported gcc version: please use gcc >= 9.3 +#endif +#endif + +#endif // MGONGPUCONFIG_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h new file mode 100644 index 0000000000..caff927311 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -0,0 +1,633 @@ +#ifndef MGONGPUCXTYPES_H +#define MGONGPUCXTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include "mgOnGpuFptypes.h" + +#include + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) HEADERS +//========================================================================== + +#include + +// Complex type in cuda: thrust or cucomplex or cxsmpl +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) +#include +#pragma clang diagnostic pop +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX +#include +#elif not defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +#endif +#else +// Complex type in c++: std::complex or cxsmpl +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#include +#elif not defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +#endif +#endif + +//========================================================================== +// COMPLEX TYPES: SIMPLE COMPLEX CLASS (cxsmpl) +//========================================================================== + +namespace mgOnGpu /* clang-format off */ +{ + // --- Type definition (simple complex type derived from cxtype_v) + template + class cxsmpl + { + public: + __host__ __device__ constexpr cxsmpl() 
: m_real( 0 ), m_imag( 0 ) {} + cxsmpl( const cxsmpl& ) = default; + cxsmpl( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl( const FP& r, const FP& i = 0 ) : m_real( r ), m_imag( i ) {} + __host__ __device__ constexpr cxsmpl( const std::complex& c ) : m_real( c.real() ), m_imag( c.imag() ) {} + cxsmpl& operator=( const cxsmpl& ) = default; + cxsmpl& operator=( cxsmpl&& ) = default; + __host__ __device__ constexpr cxsmpl& operator+=( const cxsmpl& c ) { m_real += c.real(); m_imag += c.imag(); return *this; } + __host__ __device__ constexpr cxsmpl& operator-=( const cxsmpl& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } + __host__ __device__ constexpr const FP& real() const { return m_real; } + __host__ __device__ constexpr const FP& imag() const { return m_imag; } + //constexpr operator std::complex() const { return std::complex( m_real, m_imag ); } // cxsmpl to std::complex (float-to-float or double-to-double) + private: + FP m_real, m_imag; // RI + }; + + template + inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + conj( const cxsmpl& c ) + { + return cxsmpl( c.real(), -c.imag() ); + } +} /* clang-format on */ + +// Expose the cxsmpl class outside the namespace +using mgOnGpu::cxsmpl; + +// Printout to stream for user defined types +template +inline __host__ __device__ std::ostream& +operator<<( std::ostream& out, const cxsmpl& c ) +{ + out << std::complex( c.real(), c.imag() ); + return out; +} + +// Operators for cxsmpl +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl a ) +{ + return a; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a ) +{ + return cxsmpl( -a.real(), -a.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() + b.real(), a.imag() + b.imag() ); +} + +template +inline 
__host__ __device__ constexpr cxsmpl +operator+( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) + b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() - b.real(), a.imag() - b.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) - b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const cxsmpl& a, const cxsmpl& b ) +{ + return cxsmpl( a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag() ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) * b; +} + +inline __host__ __device__ constexpr cxsmpl +operator*( const double& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) * b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const cxsmpl& a, const cxsmpl& b ) +{ + FP bnorm = b.real() * b.real() + b.imag() * b.imag(); + return cxsmpl( ( a.real() * b.real() + a.imag() * b.imag() ) / bnorm, + ( a.imag() * b.real() - a.real() * b.imag() ) / bnorm ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const FP& a, const cxsmpl& b ) +{ + return cxsmpl( a, 0 ) / b; +} + +template +inline __host__ __device__ constexpr cxsmpl +operator+( const cxsmpl& a, const FP& b ) +{ + return a + cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator-( const cxsmpl& a, const FP& b ) +{ + return a - cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator*( const cxsmpl& a, const FP& b ) +{ + return a * cxsmpl( b, 0 ); +} + +template +inline __host__ __device__ constexpr cxsmpl +operator/( const cxsmpl& a, const FP& b ) +{ + return a / cxsmpl( b, 0 ); +} + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) TYPEDEFS 
+//========================================================================== + +namespace mgOnGpu +{ + + // --- Type definitions (complex type: cxtype) +#ifdef __CUDACC__ // cuda +#if defined MGONGPU_CUCXTYPE_THRUST + typedef thrust::complex cxtype; +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX +#if defined MGONGPU_FPTYPE_DOUBLE + typedef cuDoubleComplex cxtype; +#elif defined MGONGPU_FPTYPE_FLOAT + typedef cuFloatComplex cxtype; +#endif +#else + typedef cxsmpl cxtype; +#endif +#else // c++ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX + typedef std::complex cxtype; +#else + typedef cxsmpl cxtype; +#endif +#endif + + // The number of floating point types in a complex type (real, imaginary) + constexpr int nx2 = 2; + + // SANITY CHECK: memory access may be based on casts of fptype[2] to cxtype (e.g. for wavefunctions) + static_assert( sizeof( cxtype ) == nx2 * sizeof( fptype ), "sizeof(cxtype) is not 2*sizeof(fptype)" ); +} + +// Expose typedefs and operators outside the namespace +using mgOnGpu::cxtype; + +//========================================================================== +// COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS +//========================================================================== + +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + +//------------------------------ +// CUDA or C++ - using cxsmpl +//------------------------------ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // cxsmpl constructor +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return c.real(); // cxsmpl::real() +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return c.imag(); // cxsmpl::imag() +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( cxsmpl ) +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cxsmpl (float-to-float or 
float-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + +//========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust + +//------------------------------ +// CUDA - using thrust::complex +//------------------------------ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // thrust::complex constructor +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return c.real(); // thrust::complex::real() +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return c.imag(); // thrust::complex::imag() +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( thrust::complex ) +} + +inline __host__ __device__ const cxtype& +cxmake( const cxtype& c ) +{ + return c; +} + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST + +//========================================================================== + +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex + +//------------------------------ +// CUDA - using cuComplex +//------------------------------ + +#if defined MGONGPU_FPTYPE_DOUBLE // cuda + cucomplex + double + +//+++++++++++++++++++++++++ +// cuDoubleComplex ONLY +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return make_cuDoubleComplex( r, i ); +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return cuCreal( c ); // returns by value +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return cuCimag( c ); // 
returns by value +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const cxtype& b ) +{ + return cuCadd( a, b ); +} + +inline __host__ __device__ cxtype& +operator+=( cxtype& a, const cxtype& b ) +{ + a = cuCadd( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const cxtype& b ) +{ + return cuCsub( a, b ); +} + +inline __host__ __device__ cxtype& +operator-=( cxtype& a, const cxtype& b ) +{ + a = cuCsub( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const cxtype& b ) +{ + return cuCmul( a, b ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const cxtype& b ) +{ + return cuCdiv( a, b ); +} + +#elif defined MGONGPU_FPTYPE_FLOAT // cuda + cucomplex + float + +//+++++++++++++++++++++++++ +// cuFloatComplex ONLY +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return make_cuFloatComplex( r, i ); +} + +inline __host__ __device__ fptype +cxreal( const cxtype& c ) +{ + return cuCrealf( c ); // returns by value +} + +inline __host__ __device__ fptype +cximag( const cxtype& c ) +{ + return cuCimagf( c ); // returns by value +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const cxtype& b ) +{ + return cuCaddf( a, b ); +} + +inline __host__ __device__ cxtype& +operator+=( cxtype& a, const cxtype& b ) +{ + a = cuCaddf( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const cxtype& b ) +{ + return cuCsubf( a, b ); +} + +inline __host__ __device__ cxtype& +operator-=( cxtype& a, const cxtype& b ) +{ + a = cuCsubf( a, b ); + return a; +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const cxtype& b ) +{ + return cuCmulf( a, b ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const cxtype& b ) +{ + return cuCdivf( a, b ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const 
std::complex& c ) // std::complex to cucomplex (cast double-to-float) +{ + return cxmake( (fptype)c.real(), (fptype)c.imag() ); +} + +#endif + +//+++++++++++++++++++++++++ +// cuDoubleComplex OR +// cuFloatComplex +//+++++++++++++++++++++++++ + +inline __host__ __device__ cxtype +operator+( const cxtype a ) +{ + return a; +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a ) +{ + return cxmake( -cxreal( a ), -cximag( a ) ); +} + +inline __host__ __device__ cxtype +operator+( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) + b; +} + +inline __host__ __device__ cxtype +operator-( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) - b; +} + +inline __host__ __device__ cxtype +operator*( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) * b; +} + +inline __host__ __device__ cxtype +operator/( const fptype& a, const cxtype& b ) +{ + return cxmake( a, 0 ) / b; +} + +inline __host__ __device__ cxtype +operator+( const cxtype& a, const fptype& b ) +{ + return a + cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator-( const cxtype& a, const fptype& b ) +{ + return a - cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator*( const cxtype& a, const fptype& b ) +{ + return a * cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +operator/( const cxtype& a, const fptype& b ) +{ + return a / cxmake( b, 0 ); +} + +inline __host__ __device__ cxtype +cxconj( const cxtype& c ) +{ + return cxmake( cxreal( c ), -cximag( c ) ); +} + +inline __host__ cxtype // NOT __device__ +cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX + +//========================================================================== + +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex + +//------------------------------ +// C++ - 
using std::complex +//------------------------------ + +inline cxtype +cxmake( const fptype& r, const fptype& i ) +{ + return cxtype( r, i ); // std::complex constructor +} + +inline fptype +cxreal( const cxtype& c ) +{ + return c.real(); // std::complex::real() +} + +inline fptype +cximag( const cxtype& c ) +{ + return c.imag(); // std::complex::imag() +} + +inline cxtype +cxconj( const cxtype& c ) +{ + return conj( c ); // conj( std::complex ) +} + +inline const cxtype& +cxmake( const cxtype& c ) // std::complex to std::complex (float-to-float or double-to-double) +{ + return c; +} + +#if defined MGONGPU_FPTYPE_FLOAT +inline cxtype +cxmake( const std::complex& c ) // std::complex to std::complex (cast double-to-float) +{ + return cxmake( (fptype)c.real(), (fptype)c.imag() ); +} +#endif + +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX + +//========================================================================== + +inline __host__ __device__ const cxtype +cxmake( const cxsmpl& c ) // cxsmpl to cxtype (float-to-float or float-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +inline __host__ __device__ const cxtype +cxmake( const cxsmpl& c ) // cxsmpl to cxtype (double-to-float or double-to-double) +{ + return cxmake( c.real(), c.imag() ); +} + +//========================================================================== +// COMPLEX TYPES: WRAPPER OVER RI FLOATING POINT PAIR (cxtype_ref) +//========================================================================== + +namespace mgOnGpu /* clang-format off */ +{ + // The cxtype_ref class (a non-const reference to two fp variables) was originally designed for cxtype_v::operator[] + // It used to be included in the code only when MGONGPU_HAS_CPPCXTYPEV_BRK (originally MGONGPU_HAS_CPPCXTYPE_REF) is defined + // It is now always included in the code because it is needed also to access an fptype wavefunction buffer as a cxtype + class cxtype_ref + { + public: + cxtype_ref() = 
delete; + cxtype_ref( const cxtype_ref& ) = delete; + cxtype_ref( cxtype_ref&& ) = default; // copy refs + __host__ __device__ cxtype_ref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {} // copy refs + cxtype_ref& operator=( const cxtype_ref& ) = delete; + //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; } // copy values + __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } + private: + fptype *m_preal, *m_pimag; // RI + }; +} /* clang-format on */ + +// Printout to stream for user defined types +inline __host__ __device__ std::ostream& +operator<<( std::ostream& out, const mgOnGpu::cxtype_ref& c ) +{ + out << (cxtype)c; + return out; +} + +//========================================================================== + +#endif // MGONGPUCXTYPES_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h new file mode 100644 index 0000000000..b278275f80 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h @@ -0,0 +1,87 @@ +#ifndef MGONGPUFPTYPES_H +#define MGONGPUFPTYPES_H 1 + +#include "mgOnGpuConfig.h" + +#include +#include + +//========================================================================== + +#ifdef __CUDACC__ // cuda + +//------------------------------ +// Floating point types - Cuda +//------------------------------ + +/* +inline __host__ __device__ fptype +fpmax( const fptype& a, const fptype& b ) +{ + return max( a, b ); +} + +inline __host__ __device__ fptype +fpmin( const fptype& a, const fptype& b ) +{ + return min( a, b ); +} +*/ + +inline __host__ __device__ const fptype& +fpmax( const fptype& a, const fptype& b ) +{ + return ( ( b < a ) ? 
a : b ); +} + +inline __host__ __device__ const fptype& +fpmin( const fptype& a, const fptype& b ) +{ + return ( ( a < b ) ? a : b ); +} + +inline __host__ __device__ fptype +fpsqrt( const fptype& f ) +{ +#if defined MGONGPU_FPTYPE_FLOAT + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html + return sqrtf( f ); +#else + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html + return sqrt( f ); +#endif +} + +#endif // #ifdef __CUDACC__ + +//========================================================================== + +#ifndef __CUDACC__ + +//------------------------------ +// Floating point types - C++ +//------------------------------ + +inline const fptype& +fpmax( const fptype& a, const fptype& b ) +{ + return std::max( a, b ); +} + +inline const fptype& +fpmin( const fptype& a, const fptype& b ) +{ + return std::min( a, b ); +} + +inline fptype +fpsqrt( const fptype& f ) +{ + return std::sqrt( f ); +} + +#endif // #ifndef __CUDACC__ + +//========================================================================== + +#endif // MGONGPUFPTYPES_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h new file mode 100644 index 0000000000..0dd4c69bd4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -0,0 +1,829 @@ +#ifndef MGONGPUVECTORS_H +#define MGONGPUVECTORS_H 1 + +#include "mgOnGpuCxtypes.h" +#include "mgOnGpuFptypes.h" + +#include + +//========================================================================== + +//------------------------------ +// Vector types - C++ +//------------------------------ + +#ifdef __clang__ +// If set: return a pair of (fptype&, fptype&) by non-const reference in cxtype_v::operator[] +// This is forbidden in clang ("non-const reference cannot bind to vector element") +// See also https://stackoverflow.com/questions/26554829 +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // clang test (compilation fails 
also on clang 12.0, issue #182) +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // clang default +#elif defined __INTEL_COMPILER +//#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // icc default? +#undef MGONGPU_HAS_CPPCXTYPEV_BRK // icc test +#else +#define MGONGPU_HAS_CPPCXTYPEV_BRK 1 // gcc default +//#undef MGONGPU_HAS_CPPCXTYPEV_BRK // gcc test (very slightly slower? issue #172) +#endif + +namespace mgOnGpu /* clang-format off */ +{ +#ifdef MGONGPU_CPPSIMD + + const int neppV = MGONGPU_CPPSIMD; + + // SANITY CHECK: cppAlign must be a multiple of neppV * sizeof(fptype) + static_assert( mgOnGpu::cppAlign % ( neppV * sizeof( fptype ) ) == 0 ); + + // SANITY CHECK: check that neppV is a power of two + static_assert( ispoweroftwo( neppV ), "neppV is not a power of 2" ); + + // --- Type definition (using vector compiler extensions: need -march=...) + // For gcc: https://gcc.gnu.org/onlinedocs/gcc/Vector-Extensions.html + // For clang: https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors +#ifdef __clang__ + typedef fptype fptype_v __attribute__( ( ext_vector_type( neppV ) ) ); // RRRR +#else + typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) ); // RRRR +#endif + + // Mixed fptypes #537: float for color algebra and double elsewhere +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT + const int neppV2 = MGONGPU_CPPSIMD * 2; + static_assert( mgOnGpu::cppAlign % ( neppV2 * sizeof( fptype2 ) ) == 0 ); + static_assert( ispoweroftwo( neppV2 ), "neppV2 is not a power of 2" ); +#ifdef __clang__ + typedef fptype2 fptype2_v __attribute__( ( ext_vector_type( neppV2 ) ) ); // RRRRRRRR +#else + typedef fptype2 fptype2_v __attribute__( ( vector_size( neppV2 * sizeof( fptype2 ) ) ) ); // RRRRRRRR +#endif +#else + typedef fptype_v fptype2_v; +#endif + + // --- Type definition (using vector compiler extensions: need -march=...) 
+ class cxtype_v // no need for "class alignas(2*sizeof(fptype_v)) cxtype_v" + { + public: + // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) + // See https://en.cppreference.com/w/c/language/array_initialization#Notes + cxtype_v() : m_real{ 0 }, m_imag{ 0 } {} // RRRR=0000 IIII=0000 + cxtype_v( const cxtype_v& ) = default; + cxtype_v( cxtype_v&& ) = default; + cxtype_v( const fptype_v& r, const fptype_v& i ) : m_real( r ), m_imag( i ) {} + cxtype_v( const fptype_v& r ) : m_real( r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v& operator=( const cxtype_v& ) = default; + cxtype_v& operator=( cxtype_v&& ) = default; + cxtype_v& operator+=( const cxtype_v& c ) { m_real += c.real(); m_imag += c.imag(); return *this; } + cxtype_v& operator-=( const cxtype_v& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } +#ifdef MGONGPU_HAS_CPPCXTYPEV_BRK + // NB: THIS IS THE FUNDAMENTAL DIFFERENCE BETWEEN MGONGPU_HAS_CPPCXTYPEV_BRK DEFINED AND NOT DEFINED + // NB: the alternative "clang" implementation is simpler: it simply does not have any bracket operator[] + // NB: ** do NOT implement operator[] to return a value: it does not fail the build (why?) and gives unexpected results! ** + cxtype_ref operator[]( size_t i ) const { return cxtype_ref( m_real[i], m_imag[i] ); } +#endif + const fptype_v& real() const { return m_real; } + const fptype_v& imag() const { return m_imag; } + private: + fptype_v m_real, m_imag; // RRRRIIII + }; + + // --- Type definition (using vector compiler extensions: need -march=...) 
+#ifdef __clang__ // https://clang.llvm.org/docs/LanguageExtensions.html#vectors-and-extended-vectors +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb +#endif +#else // gcc +#if defined MGONGPU_FPTYPE_DOUBLE + typedef long int bool_v __attribute__( ( vector_size( neppV * sizeof( long int ) ) ) ); // bbbb +#elif defined MGONGPU_FPTYPE_FLOAT + typedef int bool_v __attribute__( ( vector_size( neppV * sizeof( int ) ) ) ); // bbbb +#endif +#endif + +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) + + const int neppV = 1; + +#endif // #ifdef MGONGPU_CPPSIMD + +} /* clang-format on */ + +//-------------------------------------------------------------------------- + +// Expose typedefs outside the namespace +using mgOnGpu::neppV; +#ifdef MGONGPU_CPPSIMD +using mgOnGpu::fptype_v; +using mgOnGpu::fptype2_v; +using mgOnGpu::cxtype_v; +using mgOnGpu::bool_v; +#endif + +//-------------------------------------------------------------------------- + +#ifndef __CUDACC__ + +// Printout to stream for user defined types + +#ifndef MGONGPU_CPPCXTYPE_CXSMPL // operator<< for cxsmpl has already been defined! 
+inline std::ostream& +operator<<( std::ostream& out, const cxtype& c ) +{ + out << "[" << cxreal( c ) << "," << cximag( c ) << "]"; + //out << cxreal(c) << "+i" << cximag(c); + return out; +} +#endif + +/* +#ifdef MGONGPU_CPPSIMD +inline std::ostream& +operator<<( std::ostream& out, const bool_v& v ) +{ + out << "{ " << v[0]; + for ( int i=1; i +#include +#include + +// Simplified rambo version for 2 to N (with N>=2) processes with massless particles +#ifdef __CUDACC__ +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + using mgOnGpu::np4; + using mgOnGpu::npari; + using mgOnGpu::nparf; + using mgOnGpu::npar; + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the initial particles + // [NB: the output buffer includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaInitial( const fptype energy, // input: energy + fptype* momenta ) // output: momenta for one event or for a set of events + { + const fptype energy1 = energy / 2; + const fptype energy2 = energy / 2; + const fptype mom = energy / 2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 0 ) = energy1; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 0 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 0 ) = mom; + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, 1 ) = energy2; + M_ACCESS::kernelAccessIp4Ipar( momenta, 1, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 2, 1 ) = 0; + M_ACCESS::kernelAccessIp4Ipar( momenta, 3, 1 ) = -mom; + } + + //-------------------------------------------------------------------------- + + // Fill in the momenta of the final particles using the RAMBO algorithm + // [NB: the output buffer includes both initial and final momenta, but only initial momenta are filled in] + template + __host__ __device__ void + ramboGetMomentaFinal( const fptype energy, // input: energy + const fptype* 
rndmom, // input: random numbers in [0,1] for one event or for a set of events + fptype* momenta, // output: momenta for one event or for a set of events + fptype* wgts ) // output: weights for one event or for a set of events + { + /**************************************************************************** + * rambo * + * ra(ndom) m(omenta) b(eautifully) o(rganized) * + * * + * a democratic multi-particle phase space generator * + * authors: s.d. ellis, r. kleiss, w.j. stirling * + * this is version 1.0 - written by r. kleiss * + * -- adjusted by hans kuijf, weights are logarithmic (1990-08-20) * + * -- adjusted by madgraph@sheffield_gpu_hackathon team (2020-07-29) * + * * + ****************************************************************************/ + + // output weight + fptype& wt = W_ACCESS::kernelAccess( wgts ); + + // AV special case nparf==1 (issue #358) + if constexpr( nparf == 1 ) + { + static bool first = true; + if( first ) + { +#ifdef __CUDACC__ + if constexpr( M_ACCESS::isOnDevice() ) // avoid + { + const int ievt0 = 0; + const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid + if( ievt == ievt0 ) + printf( "WARNING! Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + else +#endif + { + printf( "WARNING! Rambo called with 1 final particle: random numbers will be ignored\n" ); + } + first = false; + } + const int iparf = 0; + for( int i4 = 0; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = 0; + for( int ipari = 0; ipari < npari; ipari++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) += M_ACCESS::kernelAccessIp4Ipar( momenta, i4, ipari ); + } + } + wt = 1; + return; + } + + // initialization step: factorials for the phase space weight + const fptype twopi = 8. * atan( 1. ); + const fptype po2log = log( twopi / 4. 
); + fptype z[nparf]; + if constexpr( nparf > 1 ) // avoid build warning on clang (related to #358) + z[1] = po2log; + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = z[kpar - 1] + po2log - 2. * log( fptype( kpar - 1 ) ); + for( int kpar = 2; kpar < nparf; kpar++ ) z[kpar] = ( z[kpar] - log( fptype( kpar ) ) ); + + // generate n massless momenta in infinite phase space + fptype q[nparf][np4]; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + const fptype r1 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 0, iparf ); + const fptype r2 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 1, iparf ); + const fptype r3 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 2, iparf ); + const fptype r4 = R_ACCESS::kernelAccessIp4IparfConst( rndmom, 3, iparf ); + const fptype c = 2. * r1 - 1.; + const fptype s = sqrt( 1. - c * c ); + const fptype f = twopi * r2; + q[iparf][0] = -log( r3 * r4 ); + q[iparf][3] = q[iparf][0] * c; + q[iparf][2] = q[iparf][0] * s * cos( f ); + q[iparf][1] = q[iparf][0] * s * sin( f ); + } + + // calculate the parameters of the conformal transformation + fptype r[np4]; + fptype b[np4 - 1]; + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = 0.; + for( int iparf = 0; iparf < nparf; iparf++ ) + { + for( int i4 = 0; i4 < np4; i4++ ) r[i4] = r[i4] + q[iparf][i4]; + } + const fptype rmas = sqrt( pow( r[0], 2 ) - pow( r[3], 2 ) - pow( r[2], 2 ) - pow( r[1], 2 ) ); + for( int i4 = 1; i4 < np4; i4++ ) b[i4 - 1] = -r[i4] / rmas; + const fptype g = r[0] / rmas; + const fptype a = 1. / ( 1. + g ); + const fptype x0 = energy / rmas; + + // transform the q's conformally into the p's (i.e. 
the 'momenta') + for( int iparf = 0; iparf < nparf; iparf++ ) + { + fptype bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3]; + for( int i4 = 1; i4 < np4; i4++ ) + { + M_ACCESS::kernelAccessIp4Ipar( momenta, i4, iparf + npari ) = x0 * ( q[iparf][i4] + b[i4 - 1] * ( q[iparf][0] + a * bq ) ); + } + M_ACCESS::kernelAccessIp4Ipar( momenta, 0, iparf + npari ) = x0 * ( g * q[iparf][0] + bq ); + } + + // calculate weight (NB return log of weight) + wt = po2log; + if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; + +#ifndef __CUDACC__ + // issue warnings if weight is too small or too large + static int iwarn[5] = { 0, 0, 0, 0, 0 }; + if( wt < -180. ) + { + if( iwarn[0] <= 5 ) std::cout << "Too small wt, risk for underflow: " << wt << std::endl; + iwarn[0] = iwarn[0] + 1; + } + if( wt > 174. ) + { + if( iwarn[1] <= 5 ) std::cout << "Too large wt, risk for overflow: " << wt << std::endl; + iwarn[1] = iwarn[1] + 1; + } +#endif + + // return for weighted massless momenta + // nothing else to do in this event if all particles are massless (nm==0) + + return; + } + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc new file mode 100644 index 0000000000..2934e3a476 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc @@ -0,0 +1,184 @@ +#include "read_slha.h" + +#include +#include +#include +#include +#include + +void +SLHABlock::set_entry( std::vector indices, double value ) +{ + if( _entries.size() == 0 ) + _indices = indices.size(); + else if( indices.size() != _indices ) + throw "Wrong number of indices in set_entry"; + + _entries[indices] = value; +} + +double +SLHABlock::get_entry( std::vector indices, double def_val ) +{ + if( _entries.find( indices ) == _entries.end() ) + { + std::cout << "Warning: No such entry in " << _name << ", using default value " + << def_val << std::endl; + 
return def_val; + } + return _entries[indices]; +} + +void +SLHAReader::read_slha_file( std::string file_name, bool verbose ) +{ + std::ifstream param_card; + param_card.open( file_name.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + if( verbose ) std::cout << "Opened slha file " << file_name << " for reading" << std::endl; + } + else + { + const char envpath[] = "MG5AMC_CARD_PATH"; + if( !getenv( envpath ) ) + { + std::cout << "ERROR! Card file '" << file_name << "' does not exist" + << " and environment variable '" << envpath << "' is not set" << std::endl; + throw "Error while opening param card"; + } + else + { + std::cout << "WARNING! Card file '" << file_name << "' does not exist:" + << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); + param_card.open( file_name2.c_str(), std::ifstream::in ); + if( param_card.good() ) + { + std::cout << "Opened slha file " << file_name2 << " for reading" << std::endl; + } + else + { + std::cout << "ERROR! 
Card file '" << file_name2 << "' does not exist" << std::endl; + throw "Error while opening param card"; + } + } + } + char buf[200]; + std::string line; + std::string block( "" ); + while( param_card.good() ) + { + param_card.getline( buf, 200 ); + line = buf; + // Change to lowercase + transform( line.begin(), line.end(), line.begin(), (int ( * )( int ))tolower ); + if( line != "" && line[0] != '#' ) + { + if( block != "" ) + { + // Look for double index blocks + double dindex1, dindex2; + double value; + std::stringstream linestr2( line ); + if( linestr2 >> dindex1 >> dindex2 >> value && + dindex1 == int( dindex1 ) and dindex2 == int( dindex2 ) ) + { + std::vector indices; + indices.push_back( int( dindex1 ) ); + indices.push_back( int( dindex2 ) ); + set_block_entry( block, indices, value ); + // Done with this line, read next + continue; + } + std::stringstream linestr1( line ); + // Look for single index blocks + if( linestr1 >> dindex1 >> value && dindex1 == int( dindex1 ) ) + { + std::vector indices; + indices.push_back( int( dindex1 ) ); + set_block_entry( block, indices, value ); + // Done with this line, read next + continue; + } + } + // Look for block + if( line.find( "block " ) != line.npos ) + { + line = line.substr( 6 ); + // Get rid of spaces between block and block name + while( line[0] == ' ' ) + line = line.substr( 1 ); + // Now find end of block name + size_t space_pos = line.find( ' ' ); + if( space_pos != std::string::npos ) + line = line.substr( 0, space_pos ); + block = line; + continue; + } + // Look for decay + if( line.find( "decay " ) == 0 ) + { + line = line.substr( 6 ); + block = ""; + std::stringstream linestr( line ); + int pdg_code; + double value; + if( linestr >> pdg_code >> value ) + set_block_entry( "decay", pdg_code, value ); + else + std::cout << "Warning: Wrong format for decay block " << line << std::endl; + continue; + } + } + } + if( _blocks.size() == 0 ) + throw "No information read from SLHA card"; + + 
param_card.close(); +} + +double +SLHAReader::get_block_entry( std::string block_name, std::vector indices, double def_val ) +{ + if( _blocks.find( block_name ) == _blocks.end() ) + { + std::cout << "No such block " << block_name << ", using default value " + << def_val << std::endl; + return def_val; + } + return _blocks[block_name].get_entry( indices ); +} + +double +SLHAReader::get_block_entry( std::string block_name, int index, double def_val ) +{ + std::vector indices; + indices.push_back( index ); + return get_block_entry( block_name, indices, def_val ); +} + +void +SLHAReader::set_block_entry( std::string block_name, std::vector indices, double value ) +{ + if( _blocks.find( block_name ) == _blocks.end() ) + { + SLHABlock block( block_name ); + _blocks[block_name] = block; + } + _blocks[block_name].set_entry( indices, value ); + /* + cout << "Set block " << block_name << " entry "; + for (int i=0;i < indices.size();i++) + cout << indices[i] << " "; + cout << "to " << _blocks[block_name].get_entry(indices) << endl; + */ +} + +void +SLHAReader::set_block_entry( std::string block_name, int index, double value ) +{ + std::vector indices; + indices.push_back( index ); + set_block_entry( block_name, indices, value ); +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h new file mode 100644 index 0000000000..feb8b43b5a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h @@ -0,0 +1,41 @@ +#ifndef READ_SLHA_H +#define READ_SLHA_H 1 + +#include +#include +#include +#include + +class SLHABlock +{ +public: + SLHABlock( std::string name = "" ) { _name = name; } + ~SLHABlock() {} + void set_entry( std::vector indices, double value ); + double get_entry( std::vector indices, double def_val = 0 ); + void set_name( std::string name ) { _name = name; } + std::string get_name() { return _name; } + unsigned int get_indices() { return _indices; } +private: + std::string _name; + std::map, double> _entries; + 
unsigned int _indices; +}; + +class SLHAReader +{ +public: + SLHAReader( std::string file_name = "", bool verbose = true ) + { + if( file_name != "" ) read_slha_file( file_name, verbose ); + } + void read_slha_file( std::string file_name, bool verbose ); + double get_block_entry( std::string block_name, std::vector indices, double def_val = 0 ); + double get_block_entry( std::string block_name, int index, double def_val = 0 ); + void set_block_entry( std::string block_name, std::vector indices, double value ); + void set_block_entry( std::string block_name, int index, double value ); +private: + std::map _blocks; +}; + +#endif From 0fd793b9630667768a303b32112f0c237920e6e5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 13:55:33 +0200 Subject: [PATCH 02/96] [susy2] in susyggtt.sa add constexpr to cxsmmpl::conj function Now valid code? In the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl" --- epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h index caff927311..9b26c48b79 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -62,7 +62,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? 
in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); From bd5db8fcee5d9c7940fab36962de40651d37ebc3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 14:06:20 +0200 Subject: [PATCH 03/96] [susy2] in susyggtt.sa Parameters.h, fix constexpr fixes for Majorana particles in HRDCOD=1 --- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 263590e463..174899c4e0 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -125,11 +125,11 @@ namespace Parameters_MSSM_SLHA2 // keep the same name rather than HardcodedParam constexpr double mdl_Wsu4 = 1.152973e+00; constexpr double mdl_Wsd4 = 2.858123e-01; constexpr double mdl_Wch2 = 2.486895e+00; - constexpr double mdl_Wneu4 = 2.585851e+00; - constexpr double mdl_Wneu3 = 1.915985e+00; + constexpr double mdl_Wneu4_abs = 2.585851e+00; + constexpr double mdl_Wneu3_abs = 1.915985e+00; constexpr double mdl_Wch1 = 1.704145e-02; - constexpr double mdl_Wneu2 = 2.077700e-02; - constexpr double mdl_Wgo = 5.506754e+00; + constexpr double mdl_Wneu2_abs = 2.077700e-02; + constexpr double mdl_Wgo_abs = 5.506754e+00; constexpr double mdl_Wsn3 = 1.475190e-01; constexpr double mdl_Wsl3 = 1.483273e-01; constexpr double mdl_Wsn2 = 1.498816e-01; @@ -733,14 +733,16 @@ namespace Parameters_MSSM_SLHA2 // keep the same name rather than HardcodedParam constexpr double mdl_vu = mdl_vev * mdl_sin__beta; constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); - if( mdl_Mneu2 < 0 ) - mdl_Wneu2 = -abs( mdl_Wneu2 ); - if( mdl_Mneu3 < 0 ) - mdl_Wneu3 = -abs( mdl_Wneu3 ); - if( mdl_Mneu4 < 0 ) - mdl_Wneu4 = 
-abs( mdl_Wneu4 ); - if( mdl_Mgo < 0 ) - mdl_Wgo = -abs( mdl_Wgo ); + // Fixes for Majorana particles + constexpr int mdl_Wneu2_sign = ( mdl_Mneu2 < 0 ? -1 : + 1 ); + constexpr int mdl_Wneu3_sign = ( mdl_Mneu3 < 0 ? -1 : + 1 ); + constexpr int mdl_Wneu4_sign = ( mdl_Mneu4 < 0 ? -1 : + 1 ); + constexpr int mdl_Wgo_sign = ( mdl_Mgo < 0 ? -1 : + 1 ); + constexpr double mdl_Wneu2 = mdl_Wneu2_sign * mdl_Wneu2_abs; + constexpr double mdl_Wneu3 = mdl_Wneu3_sign * mdl_Wneu3_abs; + constexpr double mdl_Wneu4 = mdl_Wneu4_sign * mdl_Wneu4_abs; + constexpr double mdl_Wgo = mdl_Wgo_sign * mdl_Wgo_abs; + // Model couplings independent of aS // (none) From 36dfe0591c31b27488c5f140042e88771b259e81 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 14:09:36 +0200 Subject: [PATCH 04/96] [susy2] in susyggtt.sa Parameters.h, fix mdl_G__exp__2 as in SM ggtt.sa (why is this different here??) Replace constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); by const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 174899c4e0..605dd124f9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -805,7 +805,7 @@ namespace Parameters_MSSM_SLHA2_dependentCouplings // Model parameters dependent on aS //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype_sv G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); - constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS out.GC_6 = -G; out.GC_51 = -( cI * G * mdl_I51x11 ); From 586195dced571514e6f5f903c2e4b4b7fb7fe9af Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 14:13:36 +0200 Subject: [PATCH 05/96] [susy2] in susyggtt.sa Parameters.h, move non-constexper Majorana fixes from printParameters to setParameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HRDCOD=1 build now fails with ccache /usr/local/cuda-12.0/bin/nvcc -O3 -lineinfo -I. -I../../src -I../../../../../tools -I/usr/local/cuda-12.0/include/ -DUSE_NVTX -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70 -use_fast_math -std=c++17 -ccbin /usr/lib64/ccache/g++ -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -Xcompiler -fPIC -c gcheck_sa.cu -o gcheck_sa.o ../../src/Parameters_MSSM_SLHA2.h(310): error: expression must have a constant value ../../src/Parameters_MSSM_SLHA2.h(310): note #2703-D: cannot call non-constexpr function "atan(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(4137): here ../../src/Parameters_MSSM_SLHA2.h(725): error: expression must have a constant value ../../src/Parameters_MSSM_SLHA2.h(725): note #2703-D: cannot call non-constexpr function "cos(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(553): here ../../src/Parameters_MSSM_SLHA2.h(726): error: expression must have a constant value ../../src/Parameters_MSSM_SLHA2.h(726): note #2703-D: cannot call non-constexpr function "sin(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(520): here 3 errors detected in the compilation of "gcheck_sa.cu". The CUDA_HOME=none HRDCOD=0 build also fails with ccache g++ -O3 -std=c++17 -I. 
-I../../src -I../../../../../tools -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -DMGONGPU_HAS_NO_CURAND -fPIC -c CPPProcess.cc -o CPPProcess.o CPPProcess.cc: In function ‘void mg5amcCpu::calculate_wavefunctions(int, const fptype*, const fptype*, mgOnGpu::fptype*, fptype_sv*, int)’: CPPProcess.cc:241:81: error: wrong type argument to unary minus 241 | FFV1_0( w_fp[3], w_fp[2], w_fp[4], -COUPs[1], &_fp[0] ); | ~~~~~~~^ CPPProcess.cc:251:62: error: wrong type argument to unary minus 251 | FFV1_1( w_fp[2], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); | ~~~~~~~^ CPPProcess.cc:254:81: error: wrong type argument to unary minus 254 | FFV1_0( w_fp[3], w_fp[4], w_fp[1], -COUPs[1], &_fp[0] ); | ~~~~~~~^ CPPProcess.cc:263:62: error: wrong type argument to unary minus 263 | FFV1_2( w_fp[3], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); | ~~~~~~~^ CPPProcess.cc:266:81: error: wrong type argument to unary minus 266 | FFV1_0( w_fp[4], w_fp[2], w_fp[1], -COUPs[1], &_fp[0] ); | ~~~~~~~^ --- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index 38dd9f2ebe..bed37ead65 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -793,6 +793,14 @@ Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) mdl_vd = mdl_vev * mdl_cos__beta; mdl_vu = mdl_vev * mdl_sin__beta; mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + if( mdl_Mneu2 < 0 ) + mdl_Wneu2 = -abs( mdl_Wneu2 ); + if( mdl_Mneu3 < 0 ) + mdl_Wneu3 = -abs( mdl_Wneu3 ); + if( mdl_Mneu4 < 0 ) + mdl_Wneu4 = -abs( mdl_Wneu4 ); + if( mdl_Mgo < 0 ) + mdl_Wgo = -abs( mdl_Wgo ); } void @@ -1443,14 +1451,6 @@ 
Parameters_MSSM_SLHA2::printIndependentParameters() std::cout << std::setw( 20 ) << "mdl_vd = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vd << std::endl; std::cout << std::setw( 20 ) << "mdl_vu = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_vu << std::endl; std::cout << std::setw( 20 ) << "mdl_ee__exp__2 = " << std::setiosflags( std::ios::scientific ) << std::setw( 10 ) << mdl_ee__exp__2 << std::endl; - if( mdl_Mneu2 < 0 ) - mdl_Wneu2 = -abs( mdl_Wneu2 ); - if( mdl_Mneu3 < 0 ) - mdl_Wneu3 = -abs( mdl_Wneu3 ); - if( mdl_Mneu4 < 0 ) - mdl_Wneu4 = -abs( mdl_Wneu4 ); - if( mdl_Mgo < 0 ) - mdl_Wgo = -abs( mdl_Wgo ); } void From 01f5bbc4ce2cecaf60daa5e3a50dcbad5e8c7956 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 14:58:22 +0200 Subject: [PATCH 06/96] [susy2] in CODEGEN, add SUSY process susy_gq_ttllq This is cmd="import model MSSM_SLHA2; define q = u c d s u~ c~ d~ s~; generate g q > t t~ l- l+ q" This is one of the first processes I tested (inspired from Nathan's tests). But in the end many issues are visible in SM gq_ttq and gq_ttllq, and in SUSY gg_tt I keep this commit for the record, but I will revert it --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index ea22d6fee0..c645915123 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -43,6 +43,9 @@ function codeGenAndDiff() susy_gg_tt) cmd="import model MSSM_SLHA2; generate g g > t t~" ;; + susy_gq_ttllq) + cmd="import model MSSM_SLHA2; define q = u c d s u~ c~ d~ s~; generate g q > t t~ l- l+ q" + ;; *) echo -e "\nWARNING! 
Skipping unknown process '$proc'" return From 182ae676cf2e4c70aae28d86f4a99cac06477b3f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 3 Apr 2023 20:43:51 +0200 Subject: [PATCH 07/96] [susy2] in CODEGEN, remove SUSY process susy_gq_ttllq Revert "[susy2] in CODEGEN, add SUSY process susy_gq_ttllq" This reverts commit 01f5bbc4ce2cecaf60daa5e3a50dcbad5e8c7956. --- epochX/cudacpp/CODEGEN/generateAndCompare.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh index c645915123..ea22d6fee0 100755 --- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh +++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh @@ -43,9 +43,6 @@ function codeGenAndDiff() susy_gg_tt) cmd="import model MSSM_SLHA2; generate g g > t t~" ;; - susy_gq_ttllq) - cmd="import model MSSM_SLHA2; define q = u c d s u~ c~ d~ s~; generate g q > t t~ l- l+ q" - ;; *) echo -e "\nWARNING! Skipping unknown process '$proc'" return From e9c7b070bf059551e5e86b15abbcc1b592c3f73d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 13 Feb 2024 18:40:51 +0100 Subject: [PATCH 08/96] [susy2] regenerate susy_gg_tt.sa with the latest master - will then apply old changes on top 'make HRDCOD=0' fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h:26:2: error: #error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" 26 | #error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" | ^~~~~ 'make HRDCOD=1' fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. -fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h:403:53: error: call to non-'constexpr' function 'mgOnGpu::cxsmpl mgOnGpu::conj(const cxsmpl&) [with FP = double]' 403 | constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); | ~~~~^~~~~~~~~~~~~ --- epochX/cudacpp/susy_gg_tt.sa/.clang-format | 5 +- .../cudacpp/susy_gg_tt.sa/CMake/Compilers.txt | 5 + epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt | 5 + .../cudacpp/susy_gg_tt.sa/CMake/Platforms.txt | 5 + epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt | 5 + .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 151 +- .../susy_gg_tt.sa/SubProcesses/Bridge.h | 141 +- .../SubProcesses/BridgeKernels.cc | 18 +- .../SubProcesses/BridgeKernels.h | 11 +- .../susy_gg_tt.sa/SubProcesses/CMakeLists.txt | 5 + .../SubProcesses/CrossSectionKernels.cc | 10 +- .../SubProcesses/CrossSectionKernels.h | 9 +- .../susy_gg_tt.sa/SubProcesses/CudaRuntime.h | 80 - .../SubProcesses/EventStatistics.h | 13 +- .../susy_gg_tt.sa/SubProcesses/MadgraphTest.h | 55 +- .../SubProcesses/MatrixElementKernels.cc | 42 +- .../SubProcesses/MatrixElementKernels.h | 11 +- 
.../SubProcesses/MemoryAccessAmplitudes.h | 232 +- .../SubProcesses/MemoryAccessCouplings.h | 472 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 112 +- .../SubProcesses/MemoryAccessDenominators.h | 26 +- .../SubProcesses/MemoryAccessGs.h | 262 +- .../SubProcesses/MemoryAccessHelpers.h | 7 +- .../SubProcesses/MemoryAccessMatrixElements.h | 234 +- .../SubProcesses/MemoryAccessMomenta.h | 449 +- .../SubProcesses/MemoryAccessNumerators.h | 26 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 16 +- .../SubProcesses/MemoryAccessVectors.h | 7 +- .../SubProcesses/MemoryAccessWavefunctions.h | 242 +- .../SubProcesses/MemoryAccessWeights.h | 5 + .../SubProcesses/MemoryBuffers.h | 79 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt | 5 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 291 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h | 46 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h | 1 - .../RandomNumberKernels.cc | 1 - .../P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc | 301 +- .../epoch_process_id.h | 7 +- .../P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f | 11 +- .../gBridgeKernels.cu | 1 - .../P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu | 1 - .../gCrossSectionKernels.cu | 1 - .../gMatrixElementKernels.cu | 1 - .../gRamboSamplingKernels.cu | 1 - .../gRandomNumberKernels.cu | 1 - .../P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu | 1 - .../SubProcesses/RamboSamplingKernels.cc | 23 +- .../SubProcesses/RamboSamplingKernels.h | 9 +- .../SubProcesses/RandomNumberKernels.cc | 149 - .../SubProcesses/RandomNumberKernels.h | 63 +- .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 534 +- .../susy_gg_tt.sa/SubProcesses/fbridge.cc | 54 +- .../susy_gg_tt.sa/SubProcesses/fbridge.inc | 32 +- .../susy_gg_tt.sa/SubProcesses/fsampler.cc | 16 +- .../susy_gg_tt.sa/SubProcesses/fsampler.inc | 5 + .../cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h | 5 + .../SubProcesses/ompnumthreads.h | 5 + .../susy_gg_tt.sa/SubProcesses/perf.py | 5 + .../susy_gg_tt.sa/SubProcesses/profile.sh | 5 + 
.../susy_gg_tt.sa/SubProcesses/runTest.cc | 36 +- .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 115 +- .../susy_gg_tt.sa/SubProcesses/testxxx.cc | 174 +- .../SubProcesses/testxxx_cc_ref.txt | 5398 +++++++++++------ .../susy_gg_tt.sa/SubProcesses/timer.h | 5 + .../susy_gg_tt.sa/SubProcesses/timermap.h | 5 + .../cudacpp/susy_gg_tt.sa/src/CMakeLists.txt | 5 + .../susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h | 130 +- .../src/Parameters_MSSM_SLHA2.cc | 15 +- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 1602 ++--- .../cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk | 75 +- .../cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h | 123 +- .../susy_gg_tt.sa/src/mgOnGpuCxtypes.h | 861 +-- .../susy_gg_tt.sa/src/mgOnGpuFptypes.h | 142 +- .../susy_gg_tt.sa/src/mgOnGpuVectors.h | 1162 ++-- epochX/cudacpp/susy_gg_tt.sa/src/rambo.h | 25 +- epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc | 22 +- epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h | 9 + 77 files changed, 8709 insertions(+), 5510 deletions(-) delete mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu delete mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu delete mode 120000 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu delete mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc mode change 100755 => 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh diff --git a/epochX/cudacpp/susy_gg_tt.sa/.clang-format b/epochX/cudacpp/susy_gg_tt.sa/.clang-format index 12afd69b12..ecdf235089 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/.clang-format +++ b/epochX/cudacpp/susy_gg_tt.sa/.clang-format @@ -1,4 +1,7 @@ -# AV's draft .clang-format +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. # --- # February 2022: latest draft for clang 13.0.0 (BasedOnStyle: Google) # See https://releases.llvm.org/13.0.0/tools/clang/docs/ClangFormatStyleOptions.html diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt index eec4baed28..ed95d782ab 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Compilers.txt @@ -1,2 +1,7 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED True) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt index 9a0e141b81..59565e07de 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Macros.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. 
Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + MACRO(SUBDIRLIST result) FILE(GLOB children RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/*) SET(dirlist "") diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt b/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt index ab73e53db8..ed8141c7ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CMake/Platforms.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + if (CMAKE_HOST_APPLE) add_definitions(-DMGONGPU_HAS_NO_CURAND) endif(CMAKE_HOST_APPLE) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt index d3010411fc..289793b669 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. 
+ # Minimal CMake configuration to build a functional CPU version cmake_minimum_required(VERSION 3.22) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index f4d9395bb9..118f88ad3e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -14,14 +14,14 @@ Running MG5 in debug mode * * * * * * * * * * * * -* VERSION 3.5.0_lo_vect 2023-01-26 * +* VERSION 3.5.3_lo_vect 2023-12-23 * * * * WARNING: UNKNOWN DEVELOPMENT VERSION. * * WARNING: DO NOT USE FOR PRODUCTION * * * * * * The MadGraph5_aMC@NLO Development Team - Find us at * -* https://server06.fynu.ucl.ac.be/projects/madgraph * +* http://madgraph.phys.ucl.ac.be/ * * and * * http://amcatnlo.web.cern.ch/amcatnlo/ * * * @@ -51,14 +51,17 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs MG5_aMC> set lhapdf /PATH/TO/lhapdf-config Using default text editor "vi". Set another one in ./input/mg5_configuration.txt -No valid eps viewer found. Please set in ./input/mg5_configuration.txt +Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt -import /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt.mg +import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F -import model MSSM_SLHA2; generate g g > t t~ +import model MSSM_SLHA2 +INFO: load particles +INFO: load vertices +DEBUG: model prefixing takes 0.9187817573547363  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -77,7 +80,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: remove interactions: a0 su3 su3 at order: QED=1  DEBUG: remove interactions: a0 su6 su6 at order: QED=1  DEBUG: Fuse the Following coupling (they have the same value): ('GC_106', 1), ('GC_107', 1), ('GC_110', -1), ('GC_111', -1), ('GC_114', 1), ('GC_115', 1), ('GC_118', -1), ('GC_119', -1), ('GC_498', 1), ('GC_503', 1), ('GC_518', -1), ('GC_523', -1), ('GC_582', 1), ('GC_587', 1), ('GC_602', -1), ('GC_607', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_123', 1), ('GC_125', 1), ('GC_126', 1), ('GC_131', 1), ('GC_132', 1), ('GC_137', 1), ('GC_138', 1), ('GC_836', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_123', 1), ('GC_125', 1), ('GC_126', 1), ('GC_131', 1), ('GC_132', 1), ('GC_137', 1), ('GC_138', 1), ('GC_836', 1), ('GC_938', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_128', 1), ('GC_129', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_10', 1), ('GC_13', 1), ('GC_22', 1), ('GC_25', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_134', 1), ('GC_135', 1)  @@ -136,7 +139,7 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_267', 1), ('GC_268', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_269', 1), ('GC_270', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_271', 1), ('GC_272', 1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_273', 1), ('GC_274', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_273', 1), ('GC_274', 1), ('GC_946', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_276', 1), ('GC_277', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_19', 1), ('GC_28', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_279', 1), ('GC_281', 1)  @@ -151,7 +154,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_298', 1), ('GC_300', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_303', 1), ('GC_304', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_306', 1), ('GC_307', -1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_309', 1), ('GC_310', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_309', 1), ('GC_310', 1), ('GC_945', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_312', 1), ('GC_313', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_315', 1), ('GC_316', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_318', 1), ('GC_319', -1)  @@ -171,7 +174,7 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_408', 1), ('GC_409', 1), ('GC_538', 1), ('GC_542', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_410', 1), ('GC_411', 1), ('GC_412', 1), ('GC_570', 1), ('GC_574', 1), ('GC_578', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_413', 1), ('GC_414', 1), ('GC_583', 1), ('GC_588', 1)  -DEBUG: Fuse the Following coupling (they have the same value): ('GC_42', 1), ('GC_43', 1), ('GC_46', 1), ('GC_47', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_42', 1), ('GC_43', 1), ('GC_46', 1), ('GC_47', 1), ('GC_937', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_429', 1), ('GC_430', 1), ('GC_500', 1), ('GC_505', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_431', 1), ('GC_432', 1), ('GC_539', 1), ('GC_543', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_433', 1), ('GC_434', 1), ('GC_435', 1), ('GC_571', 1), ('GC_575', 1), ('GC_579', 1)  @@ -182,16 +185,20 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_456', 1), ('GC_457', 1), ('GC_458', 1), ('GC_572', 1), ('GC_576', 1), ('GC_580', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_459', 1), ('GC_460', 1), ('GC_585', 1), ('GC_590', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_423', 1), ('GC_467', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_446', 1), ('GC_468', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_475', 1), ('GC_476', 1), ('GC_502', 1), ('GC_507', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_477', 1), ('GC_478', 1), ('GC_541', 1), ('GC_545', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_45', 1), ('GC_48', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_479', 1), ('GC_480', 1), ('GC_481', 1), ('GC_573', 1), ('GC_577', 1), ('GC_581', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_482', 1), ('GC_483', 1), ('GC_586', 1), ('GC_591', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_424', 1), ('GC_490', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_447', 1), ('GC_491', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_470', 1), ('GC_492', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_2', 1), ('GC_50', -1), ('GC_52', -1), ('GC_58', -1), ('GC_60', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_108', 1), ('GC_508', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_415', 1), ('GC_509', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_438', 1), ('GC_510', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_461', 1), ('GC_511', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_484', 1), 
('GC_512', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_109', 1), ('GC_513', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_339', 1), ('GC_514', 1)  @@ -201,6 +208,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_112', 1), ('GC_528', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_416', 1), ('GC_529', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_51', 1), ('GC_53', 1), ('GC_59', 1), ('GC_61', 1), ('GC_67', 1), ('GC_69', 1), ('GC_7', -1), ('GC_75', 1), ('GC_77', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_439', 1), ('GC_530', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_462', 1), ('GC_531', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_485', 1), ('GC_532', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_113', 1), ('GC_533', 1)  @@ -211,10 +219,13 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_417', 1), ('GC_546', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_440', 1), ('GC_547', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_463', 1), ('GC_548', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_486', 1), ('GC_549', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_341', 1), ('GC_550', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_347', 1), ('GC_551', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_353', 1), ('GC_552', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_359', 1), ('GC_553', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_418', 1), ('GC_562', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_441', 1), ('GC_563', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_464', 1), ('GC_564', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_487', 1), ('GC_565', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_342', 1), ('GC_566', 1)  @@ -232,11 +243,13 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_355', 1), ('GC_600', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_361', 1), ('GC_601', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_120', 1), ('GC_612', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_426', 1), ('GC_613', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_449', 1), ('GC_614', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_472', 1), ('GC_615', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_495', 1), ('GC_616', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_121', 1), ('GC_617', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_344', 1), ('GC_618', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_350', 1), ('GC_619', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_56', 1), ('GC_62', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_356', 1), ('GC_620', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_362', 1), ('GC_621', 1)  @@ -251,6 +264,7 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_419', 1), ('GC_633', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_442', 1), ('GC_634', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_465', 1), ('GC_635', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_488', 1), ('GC_636', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_378', 1), ('GC_645', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_379', 1), ('GC_646', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_380', 1), ('GC_647', 1)  @@ -277,7 +291,10 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_401', 1), ('GC_684', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_402', 1), ('GC_685', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_428', 1), ('GC_686', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_451', 1), ('GC_687', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_474', 1), ('GC_688', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_497', 1), ('GC_689', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_676', 1), ('GC_691', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_694', 1), ('GC_697', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_695', 1), ('GC_698', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_696', 1), ('GC_699', 1)  @@ -285,9 +302,11 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_702', 1), ('GC_703', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_704', 1), ('GC_712', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_705', 1), ('GC_713', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_706', 1), ('GC_714', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_707', 1), ('GC_715', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_708', 1), ('GC_716', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_709', 1), ('GC_717', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_710', 1), ('GC_718', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_711', 1), ('GC_719', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_720', 1), ('GC_723', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_721', 1), ('GC_724', 1)  @@ -299,6 +318,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_757', 1), ('GC_758', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_759', 1), ('GC_760', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_762', 1), ('GC_763', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_766', 1), ('GC_767', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_770', 1), ('GC_771', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_772', 1), ('GC_773', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_774', 1), ('GC_775', 1)  @@ -306,15 +326,23 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_72', 1), ('GC_78', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_780', 1), ('GC_781', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_782', 1), ('GC_783', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_785', 1), ('GC_786', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_73', 1), ('GC_79', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_797', 1), ('GC_798', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_6', 1), ('GC_8', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_801', 1), ('GC_802', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_805', 1), ('GC_806', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_726', 1), ('GC_808', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_727', 1), ('GC_809', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_728', 1), ('GC_810', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_729', 1), ('GC_811', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_730', 1), ('GC_812', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_731', 1), ('GC_813', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_732', 1), ('GC_814', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_733', 1), ('GC_815', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_734', 1), ('GC_816', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_735', 1), ('GC_817', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_736', 1), ('GC_818', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_737', 1), ('GC_819', 
1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_738', 1), ('GC_820', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_739', 1), ('GC_821', 1)  @@ -322,6 +350,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_741', 1), ('GC_823', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_742', 1), ('GC_824', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_743', 1), ('GC_825', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_744', 1), ('GC_826', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_745', 1), ('GC_827', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_788', 1), ('GC_828', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_789', 1), ('GC_829', 1)  @@ -331,11 +360,13 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_793', 1), ('GC_833', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_794', 1), ('GC_834', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_795', 1), ('GC_835', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_245', 1), ('GC_837', 1), ('GC_944', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_842', 1), ('GC_844', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_843', 1), ('GC_845', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_846', 1), ('GC_848', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_847', 1), ('GC_849', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_82', 1), ('GC_85', 1), ('GC_94', 1), ('GC_97', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_851', 1), 
('GC_852', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_855', 1), ('GC_856', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_83', 1), ('GC_86', 1), ('GC_95', 1), ('GC_98', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_876', 1), ('GC_877', 1), ('GC_880', 1), ('GC_881', 1), ('GC_884', 1), ('GC_885', 1), ('GC_886', 1), ('GC_887', 1)  @@ -343,6 +374,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_879', 1), ('GC_883', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_890', 1), ('GC_898', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_891', 1), ('GC_899', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_892', 1), ('GC_900', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_893', 1), ('GC_901', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_903', 1), ('GC_904', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_858', 1), ('GC_906', -1)  @@ -350,7 +382,9 @@ INFO: Detect SLHA2 format. 
keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_860', 1), ('GC_908', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_861', 1), ('GC_909', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_100', 1), ('GC_91', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_862', 1), ('GC_910', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_863', 1), ('GC_911', -1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_864', 1), ('GC_912', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_865', 1), ('GC_913', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_866', 1), ('GC_914', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_867', 1), ('GC_915', -1)  @@ -362,6 +396,7 @@ INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Fuse the Following coupling (they have the same value): ('GC_872', 1), ('GC_920', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_873', 1), ('GC_921', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_874', 1), ('GC_922', 1)  +DEBUG: Fuse the Following coupling (they have the same value): ('GC_875', 1), ('GC_923', 1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_894', 1), ('GC_924', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_895', 1), ('GC_925', -1)  DEBUG: Fuse the Following coupling (they have the same value): ('GC_896', 1), ('GC_926', -1)  @@ -516,92 +551,56 @@ Defined multiparticle l- = e- mu- Defined multiparticle vl = ve vm vt Defined multiparticle vl~ = ve~ vm~ vt~ Defined multiparticle all = g u c d s u~ c~ d~ s~ a ve vm vt e- mu- ve~ vm~ vt~ e+ mu+ go ul cl t1 ur cr t2 dl sl b1 dr sr b2 ul~ cl~ t1~ ur~ cr~ t2~ dl~ sl~ b1~ dr~ sr~ b2~ t b t~ b~ z w+ h01 h2 h3 h+ sve svm svt el- mul- 
ta1- er- mur- ta2- w- h- sve~ svm~ svt~ el+ mul+ ta1+ er+ mur+ ta2+ n1 n2 n3 n4 x1+ x2+ ta- x1- x2- ta+ +generate g g > t t~ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.112 s +1 processes with 3 diagrams generated in 0.110 s Total: 1 processes with 3 diagrams -output standalone_cudacpp CODEGEN_cudacpp_susy_gg_tt -Load PLUGIN.CUDACPP_SA_OUTPUT -Output will be done with PLUGIN: CUDACPP_SA_OUTPUT +output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt +Load PLUGIN.CUDACPP_OUTPUT +Plugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. +It has been validated for the last time with version: 3.5.2 +Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 143]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 148]  -INFO: Creating subdirectories in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 173]  -DEBUG: type(subproc_group)= [output.py at 
line 174]  -DEBUG: type(fortran_model)= [output.py at line 175]  -DEBUG: type(me)= me=0 [output.py at line 176]  -INFO: Creating files in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx -DEBUG: Entering PLUGIN_OneProcessExporter.generate_process_files [model_handling.py at line 1246]  -DEBUG: self.include_multi_channel is not yet defined: this is standalone_cudacpp mode [model_handling.py at line 1250]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_h_file [model_handling.py at line 1389]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc -DEBUG: Entering PLUGIN_OneProcessExporter.write_process_cc_file [model_handling.py at line 1411]  -DEBUG: Entering PLUGIN_OneProcessExporter.get_sigmaKin_lines [model_handling.py at line 1107]  -DEBUG: self.include_multi_channel =  False [model_handling.py at line 1108]  -DEBUG: self.support_multichannel =  True [model_handling.py at line 1109]  -DEBUG: type(self.helas_call_writer) =  [model_handling.py at line 1203]  -DEBUG: self.support_multichannel, self.include_multi_channel =  True False [model_handling.py at line 1204]  -DEBUG: multi_channel_map =  None [model_handling.py at line 1590]  -DEBUG: diag_to_config =  {} [model_handling.py at line 1645]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1757]  -DEBUG: ('ZERO', 0, -1, 0, 0) [model_handling.py at line 1758]  -DEBUG: call =  vxxxxx( momenta,m_pars->%s, cHel[ihel][%d],%+d, w_sv[%d], %d ); [model_handling.py at line 1757]  -DEBUG: ('ZERO', 1, -1, 1, 1) [model_handling.py at line 1758]  -INFO: Created files CPPProcess.h and CPPProcess.cc in directory 
/data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. -DEBUG: Entering PLUGIN_OneProcessExporter.edit_CMakeLists [model_handling.py at line 1279]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_check_sa [model_handling.py at line 1288]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_mgonGPU [model_handling.py at line 1305]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_processidfile [model_handling.py at line 1325]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_testxxx [model_handling.py at line 1355]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memorybuffers [model_handling.py at line 1366]  -DEBUG: Entering PLUGIN_OneProcessExporter.edit_memoryaccesscouplings [model_handling.py at line 1377]  -Generated helas calls for 1 subprocesses (3 diagrams) in 0.005 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 181]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  +DEBUG: type(subproc_group)= [output.py at line 195]  +DEBUG: type(fortran_model)= [output.py at line 196]  +DEBUG: type(me)= me=0 [output.py at line 197]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 
'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 198]  +INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx +FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc +INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. +Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.126 s +ALOHA: aloha creates 2 routines in 0.140 s VVV1 FFV1 FFV1 FFV1 -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h -INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. +FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h +INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
super_write_set_parameters_onlyfixMajorana (hardcoded=False) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 617 , keys size = 617 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 617 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 617 , keys size = 617 [model_handling.py at line 733]  super_write_set_parameters_onlyfixMajorana (hardcoded=True) -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 733]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 3 , keys size = 3 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 3 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 3 , keys size = 3 [model_handling.py at line 733]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys 
size = 2 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  -DEBUG: 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) =  pardef_lines size = 2 , keys size = 2 [model_handling.py at line 716]  -DEBUG: 'parset_pars size =', len(parset_pars) =  parset_pars size = 2 [model_handling.py at line 732]  -DEBUG: 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) =  parset_lines size = 2 , keys size = 2 [model_handling.py at line 733]  -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h -FileWriter for /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc +FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h +FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.cc INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in directory -INFO: /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/MG5aMC/ghav-mg5amcnlo/CODEGEN_cudacpp_susy_gg_tt/src/. -DEBUG: Entering PLUGIN_ProcessExporter.finalize [output.py at line 190]  +INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.264s -user 0m1.200s -sys 0m0.057s +real 0m2.624s +user 0m2.440s +sys 0m0.084s +Code generation completed in 3 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h index faa8f95d1d..f9ed70dfde 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/Bridge.h @@ -1,8 +1,12 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef BRIDGE_H #define BRIDGE_H 1 -// Includes from Cuda/C++ matrix element calculations -#include "mgOnGpuConfig.h" // for mgOnGpu::npar, mgOnGpu::np4 +#include "mgOnGpuConfig.h" #include "CPPProcess.h" // for CPPProcess #include "CrossSectionKernels.h" // for flagAbnormalMEs @@ -10,6 +14,14 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include @@ -18,7 +30,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -78,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -139,41 +151,41 @@ namespace mg5amcCpu int nGoodHel() const { return m_nGoodHel; } // 
Return the total number of helicities (expose cudacpp ncomb in the Bridge interface to Fortran) - constexpr int nTotHel() const { return mgOnGpu::ncomb; } + constexpr int nTotHel() const { return CPPProcess::ncomb; } private: unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) - mg5amcGpu::DeviceBuffer m_devMomentaF; - mg5amcGpu::DeviceBufferMomenta m_devMomentaC; - mg5amcGpu::DeviceBufferGs m_devGs; - mg5amcGpu::DeviceBufferRndNumHelicity m_devRndHel; - mg5amcGpu::DeviceBufferRndNumColor m_devRndCol; - mg5amcGpu::DeviceBufferMatrixElements m_devMEs; - mg5amcGpu::DeviceBufferSelectedHelicity m_devSelHel; - mg5amcGpu::DeviceBufferSelectedColor m_devSelCol; - mg5amcGpu::PinnedHostBufferGs m_hstGs; - mg5amcGpu::PinnedHostBufferRndNumHelicity m_hstRndHel; - mg5amcGpu::PinnedHostBufferRndNumColor m_hstRndCol; - mg5amcGpu::PinnedHostBufferMatrixElements m_hstMEs; - mg5amcGpu::PinnedHostBufferSelectedHelicity m_hstSelHel; - mg5amcGpu::PinnedHostBufferSelectedColor m_hstSelCol; - std::unique_ptr m_pmek; + DeviceBuffer m_devMomentaF; + DeviceBufferMomenta m_devMomentaC; + DeviceBufferGs m_devGs; + DeviceBufferRndNumHelicity m_devRndHel; + DeviceBufferRndNumColor m_devRndCol; + DeviceBufferMatrixElements m_devMEs; + DeviceBufferSelectedHelicity m_devSelHel; + DeviceBufferSelectedColor m_devSelCol; + PinnedHostBufferGs m_hstGs; + PinnedHostBufferRndNumHelicity m_hstRndHel; + PinnedHostBufferRndNumColor m_hstRndCol; + PinnedHostBufferMatrixElements m_hstMEs; + PinnedHostBufferSelectedHelicity m_hstSelHel; + PinnedHostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; //static constexpr int s_gputhreadsmin = 16; // minimum number of gpu threads 
(TEST VALUE FOR MADEVENT) static constexpr int s_gputhreadsmin = 32; // minimum number of gpu threads (DEFAULT) #else - mg5amcCpu::HostBufferMomenta m_hstMomentaC; - mg5amcCpu::HostBufferGs m_hstGs; - mg5amcCpu::HostBufferRndNumHelicity m_hstRndHel; - mg5amcCpu::HostBufferRndNumColor m_hstRndCol; - mg5amcCpu::HostBufferMatrixElements m_hstMEs; - mg5amcCpu::HostBufferSelectedHelicity m_hstSelHel; - mg5amcCpu::HostBufferSelectedColor m_hstSelCol; - std::unique_ptr m_pmek; + HostBufferMomenta m_hstMomentaC; + HostBufferGs m_hstGs; + HostBufferRndNumHelicity m_hstRndHel; + HostBufferRndNumColor m_hstRndCol; + HostBufferMatrixElements m_hstMEs; + HostBufferSelectedHelicity m_hstSelHel; + HostBufferSelectedColor m_hstSelCol; + std::unique_ptr m_pmek; #endif }; @@ -182,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -204,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -226,9 +238,9 @@ namespace mg5amcCpu , m_hstSelCol( m_nevt ) , m_pmek( nullptr ) { - if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); - if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ + if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" 
); +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -240,17 +252,32 @@ namespace mg5amcCpu } std::cout << "WARNING! Instantiate device Bridge (nevt=" << m_nevt << ", gpublocks=" << m_gpublocks << ", gputhreads=" << m_gputhreads << ", gpublocks*gputhreads=" << m_gpublocks * m_gputhreads << ")" << std::endl; - mg5amcGpu::CPPProcess process( /*verbose=*/false ); - m_pmek.reset( new mg5amcGpu::MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); + m_pmek.reset( new MatrixElementKernelDevice( m_devMomentaC, m_devGs, m_devRndHel, m_devRndCol, m_devMEs, m_devSelHel, m_devSelCol, m_gpublocks, m_gputhreads ) ); #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; - mg5amcCpu::CPPProcess process( /*verbose=*/false ); - m_pmek.reset( new mg5amcCpu::MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ - process.initProc( "../../Cards/param_card.dat" ); + m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); +#endif // MGONGPUCPP_GPUIMPL + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
+ CPPProcess process( /*verbose=*/false ); + std::string paramCard = "../../Cards/param_card.dat"; + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 + process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -264,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -279,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); - const int thrPerEvt = mgOnGpu::npar * mgOnGpu::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -329,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -384,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { @@ -392,8 +419,8 @@ namespace mg5amcCpu if constexpr( oldImplementation ) { // SR initial implementation - constexpr int part = mgOnGpu::npar; - constexpr int mome = mgOnGpu::np4; + constexpr int part = CPPProcess::npar; + constexpr int mome = CPPProcess::np4; constexpr int strd = MemoryAccessMomenta::neppM; int pos = blockDim.x * blockIdx.x + threadIdx.x; int arrlen = nevt * part * mome; @@ -418,8 +445,8 @@ namespace mg5amcCpu // AV attempt another implementation with 1 event per thread: this seems slower... // F-style: AOS[nevtF][nparF][np4F] // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM - constexpr int npar = mgOnGpu::npar; - constexpr int np4 = mgOnGpu::np4; + constexpr int npar = CPPProcess::npar; + constexpr int np4 = CPPProcess::np4; constexpr int neppM = MemoryAccessMomenta::neppM; assert( nevt % neppM == 0 ); // number of events is not a multiple of neppM??? 
int ievt = blockDim.x * blockIdx.x + threadIdx.x; @@ -443,8 +470,8 @@ namespace mg5amcCpu if constexpr( oldImplementation ) { // SR initial implementation - constexpr unsigned int part = mgOnGpu::npar; - constexpr unsigned int mome = mgOnGpu::np4; + constexpr unsigned int part = CPPProcess::npar; + constexpr unsigned int mome = CPPProcess::np4; constexpr unsigned int strd = MemoryAccessMomenta::neppM; unsigned int arrlen = nevt * part * mome; for( unsigned int pos = 0; pos < arrlen; ++pos ) @@ -472,8 +499,8 @@ namespace mg5amcCpu // [NB! this is not a transposition, it is an AOS to AOSOA conversion: if neppM=1, a memcpy is enough] // F-style: AOS[nevtF][nparF][np4F] // C-style: AOSOA[npagM][npar][np4][neppM] with nevt=npagM*neppM - constexpr unsigned int npar = mgOnGpu::npar; - constexpr unsigned int np4 = mgOnGpu::np4; + constexpr unsigned int npar = CPPProcess::npar; + constexpr unsigned int np4 = CPPProcess::np4; constexpr unsigned int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc index c2c16ff038..eaf4037a24 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,20 +1,26 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include -using mgOnGpu::npar; // the number of particles (external = initial + final) -using mgOnGpu::np4; // the number of dimensions of 4-momenta (E,px,py,pz) - //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { + constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E,px,py,pz) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + //-------------------------------------------------------------------------- BridgeKernelBase::BridgeKernelBase( const BufferMomenta& momenta, // input: momenta @@ -40,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -91,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h index 10e664a4c4..3efef8ce97 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -7,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -84,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt index 1e15f3e9ed..256907bd62 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + SUBDIRLIST(SUBDIRS) FOREACH(subdir ${SUBDIRS}) ADD_SUBDIRECTORY(${subdir}) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 398f8a87bd..c15b39844d 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,5 +1,11 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -72,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -180,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h index 6098157b4e..4d9659e04e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -8,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index e16ed2c703..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! 
This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h index 19c5199bcc..b425a5bade 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,14 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef EventStatistics_H #define EventStatistics_H 1 -#include "mgOnGpuConfig.h" // for npar (meGeVexponent) +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" // for npar (meGeVexponent) #include #include @@ -9,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -127,7 +134,7 @@ namespace mg5amcCpu void printout( std::ostream& out ) const { const EventStatistics& s = *this; - constexpr int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + constexpr int meGeVexponent = -( 2 * CPPProcess::npar - 8 ); out << s.tag << "NumMatrixElems(notAbnormal) = " << s.nevtOK() << std::endl << std::scientific // fixed format: affects all floats (default precision: 6) << s.tag << "MeanMatrixElemValue = ( " << s.meanME() diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h index 2a0be47978..6054185300 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,12 +1,24 @@ -// Stephan Hageboeck, CERN, 12/2020 +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 +#include "mgOnGpuConfig.h" + +#include "CPPProcess.h" + #include -#include #include #include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -14,18 +26,25 @@ #include #include +#ifdef MGONGPUCPP_GPUIMPL +using mg5amcGpu::CPPProcess; +#else +using mg5amcCpu::CPPProcess; +#endif + namespace { struct ReferenceData { - std::vector>> momenta; + std::vector>> momenta; std::vector MEs; }; /// Read batches of reference data from a file and store them in a map. std::map readReferenceData( const std::string& refFileName ) { + std::cout << "INFO: Opening reference file " << refFileName << std::endl; std::ifstream referenceFile( refFileName.c_str() ); EXPECT_TRUE( referenceFile.is_open() ) << refFileName; std::map referenceData; @@ -186,34 +205,38 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, /// and compares momenta and matrix elements with a reference file. TEST_P( MadgraphTest, CompareMomentaAndME ) { - // Set to true to dump events: - constexpr bool dumpEvents = false; - constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) - const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 3.E-2; + const fptype toleranceMomenta = std::is_same::value ? 1.E-10 : 4.E-2; // see #735 #ifdef __APPLE__ const fptype toleranceMEs = std::is_same::value ? 1.E-6 : 3.E-2; // see #583 #else const fptype toleranceMEs = std::is_same::value ? 
1.E-6 : 2.E-3; #endif - std::string dumpFileName = std::string( "dump_" ) + testing::UnitTest::GetInstance()->current_test_info()->test_suite_name() + '.' + testing::UnitTest::GetInstance()->current_test_info()->name() + ".txt"; - while( dumpFileName.find( '/' ) != std::string::npos ) - { - dumpFileName.replace( dumpFileName.find( '/' ), 1, "_" ); - } + constexpr fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) + // Dump events to a new reference file? + const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); + const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); + const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else + const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { dumpFile.open( dumpFileName, std::ios::trunc ); } // Read reference data - const std::string refFileName = testDriver->getRefFileName(); std::map referenceData; if( !dumpEvents ) { @@ -268,7 +291,7 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) for( unsigned int ipar = 0; ipar < testDriver->nparticle; ++ipar ) { std::stringstream momentumErrors; - for( unsigned int icomp = 0; icomp < mgOnGpu::np4; ++icomp ) + for( unsigned int icomp = 0; icomp < CPPProcess::np4; ++icomp ) { const fptype pMadg = testDriver->getMomentum( ievt, ipar, icomp ); const fptype pOrig = referenceData[iiter].momenta[ievt][ipar][icomp]; @@ -295,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 
da81c99218..81699dfea9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,12 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -9,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -55,7 +60,7 @@ namespace mg5amcCpu int MatrixElementKernelHost::computeGoodHelicities() { - using mgOnGpu::ncomb; // the number of helicity combinations + constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations HostBufferHelicityMask hstIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the host computeDependentCouplings( m_gs.data(), m_couplings.data(), m_gs.size() ); @@ -107,10 +112,17 @@ namespace mg5amcCpu // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu bool ok = true; // this is just an assumption! const std::string tag = "arm neon (128bit as in SSE4.2)"; -#else +#elif defined( __x86_64__ ) || defined( __i386__ ) bool known = true; bool ok = __builtin_cpu_supports( "sse4.2" ); const std::string tag = "nehalem (SSE4.2)"; +#else // AV FIXME! 
Added by OM for Mac, should identify the correct __xxx__ flag that should be targeted + bool known = false; // __builtin_cpu_supports is not supported + // See https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions-Available-on-all-Configurations.html + // See https://stackoverflow.com/q/62783908 + // See https://community.arm.com/arm-community-blogs/b/operating-systems-blog/posts/runtime-detection-of-cpu-features-on-an-armv8-a-cpu + bool ok = true; // this is just an assumption! + const std::string tag = "arm neon (128bit as in SSE4.2)"; #endif #else bool known = true; @@ -138,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -193,17 +205,17 @@ namespace mg5amcGpu int MatrixElementKernelDevice::computeGoodHelicities() { - using mgOnGpu::ncomb; // the number of helicity combinations + constexpr int ncomb = CPPProcess::ncomb; // the number of helicity combinations PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -214,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h index ec0fc9b18c..72bd8f195b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 
CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -5,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -76,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -125,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index f3ab497b7a..ffb76e93de 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessAmplitudes_H #define MemoryAccessAmplitudes_H 1 @@ -9,142 +14,151 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 -//---------------------------------------------------------------------------- - -#ifndef MGONGPU_TRIVIAL_AMPLITUDES - -// A class describing the internal layout of memory buffers for amplitudes -// This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA -// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] -class MemoryAccessAmplitudesBase //_AOSOAv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: - - // Number of Events Per Page in the amplitude AOSOA memory buffer layout - static constexpr int neppA = 1; // AOS (just a test...) - -private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; + //---------------------------------------------------------------------------- - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- +#ifndef MGONGPU_TRIVIAL_AMPLITUDES - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + // A class describing the internal layout of memory 
buffers for amplitudes + // This implementation uses an AOSOA[npagA][nx2][neppA] where nevt=npagA*neppA + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessAmplitudesBase //_AOSOAv1 { - const int ipagA = ievt / neppA; // #event "A-page" - const int ieppA = ievt % neppA; // #event in the current event A-page - constexpr int ix2 = 0; - return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) + public: + + // Number of Events Per Page in the amplitude AOSOA memory buffer layout + static constexpr int neppA = 1; // AOS (just a test...) 
+ + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagA = ievt / neppA; // #event "A-page" + const int ieppA = ievt % neppA; // #event in the current event A-page + constexpr int ix2 = 0; + return &( buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA] ); // AOSOA[ipagA][ix2][ieppA] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagA = 0; + constexpr int ieppA = 0; + return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase { - constexpr int ipagA = 0; - constexpr int ieppA = 0; - return buffer[ipagA * nx2 * neppA + ix2 * neppA + ieppA]; // AOSOA[ipagA][ix2][ieppA] - } -}; - -//---------------------------------------------------------------------------- + public: -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessAmplitudes : public MemoryAccessAmplitudesBase -{ -public: + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const 
int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // 
[Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; -}; + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; #endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES -//---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessAmplitudes -{ -public: + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessAmplitudes + { + public: #ifndef MGONGPU_TRIVIAL_AMPLITUDES - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2 = - 
KernelAccessHelper::template kernelAccessField; + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2 = + KernelAccessHelper::template kernelAccessField; - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const = - KernelAccessHelper::template kernelAccessFieldConst; + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const = + KernelAccessHelper::template kernelAccessFieldConst; #else - static __host__ __device__ inline cxtype_sv* - kernelAccess( fptype* buffer ) - { - return reinterpret_cast( buffer ); - } + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } - static __host__ __device__ inline const cxtype_sv* - kernelAccessConst( const fptype* buffer ) - { - return reinterpret_cast( buffer ); - } + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + return reinterpret_cast( buffer ); + } #endif // #ifndef MGONGPU_TRIVIAL_AMPLITUDES -}; + }; + + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessAmplitudes HostAccessAmplitudes; + typedef KernelAccessAmplitudes DeviceAccessAmplitudes; -typedef KernelAccessAmplitudes 
HostAccessAmplitudes; -typedef KernelAccessAmplitudes DeviceAccessAmplitudes; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessAmplitudes_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 141d24ec71..c3123544c8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryAccessCouplings_H #define MemoryAccessCouplings_H 1 @@ -9,248 +14,257 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomentaBase::neppM #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned -//---------------------------------------------------------------------------- - -// A class describing the internal layout of memory buffers for couplings -// This implementation uses an AOSOA[npagC][ndcoup][nx2][neppC] "super-buffer" where nevt=npagC*neppC -// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling -// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] -class MemoryAccessCouplingsBase //_AOSOAv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: - - // Number of Events Per Page in the coupling AOSOA memory buffer 
layout - static constexpr int neppC = MemoryAccessMomentaBase::neppM; // use the same AOSOA striding as for momenta - - // SANITY CHECK: check that neppC is a power of two - static_assert( ispoweroftwo( neppC ), "neppC is not a power of 2" ); - - //-------------------------------------------------------------------------- - // ** NB! A single super-buffer AOSOA[npagC][ndcoup][nx2][neppC] includes data for ndcoup different couplings ** - // ** NB! The ieventAccessRecord and kernelAccess functions refer to the buffer for one individual coupling ** - // ** NB! Use idcoupAccessBuffer to add a fixed offset and locate the buffer for one given individual coupling ** - //-------------------------------------------------------------------------- - - // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) - // [Signature (non-const) ===> fptype* idcoupAccessBuffer( fptype* buffer, const int idcoup ) <===] - // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? - static __host__ __device__ inline fptype* - idcoupAccessBuffer( fptype* buffer, // input "super-buffer" - const int idcoup ) - { - constexpr int ipagC = 0; - constexpr int ieppC = 0; - constexpr int ix2 = 0; - // NB! this effectively adds an offset "idcoup * nx2 * neppC" - return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] - } - - // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) - // [Signature (const) ===> const fptype* idcoupAccessBufferConst( const fptype* buffer, const int idcoup ) <===] - // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? 
- static __host__ __device__ inline const fptype* - idcoupAccessBufferConst( const fptype* buffer, // input "super-buffer" - const int idcoup ) - { - return idcoupAccessBuffer( const_cast( buffer ), idcoup ); - } - -private: + //---------------------------------------------------------------------------- - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of couplings that dependent on the running alphas QCD in this specific process - static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + // A class describing the internal layout of memory buffers for couplings + // This implementation uses an AOSOA[npagC][ndcoup][nx2][neppC] "super-buffer" where nevt=npagC*neppC + // From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessCouplingsBase //_AOSOAv1 { - const int ipagC = ievt / neppC; // #event "C-page" - const int ieppC = ievt % neppC; // #event in the current event C-page - constexpr int idcoup = 
0; - constexpr int ix2 = 0; - return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ix2 ) + public: + + // Number of Events Per Page in the coupling AOSOA memory buffer layout + static constexpr int neppC = MemoryAccessMomentaBase::neppM; // use the same AOSOA striding as for momenta + + // SANITY CHECK: check that neppC is a power of two + static_assert( ispoweroftwo( neppC ), "neppC is not a power of 2" ); + + //-------------------------------------------------------------------------- + // ** NB! A single super-buffer AOSOA[npagC][ndcoup][nx2][neppC] includes data for ndcoup different couplings ** + // ** NB! The ieventAccessRecord and kernelAccess functions refer to the buffer for one individual coupling ** + // ** NB! Use idcoupAccessBuffer to add a fixed offset and locate the buffer for one given individual coupling ** + //-------------------------------------------------------------------------- + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (non-const) ===> fptype* idcoupAccessBuffer( fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? 
+ static __host__ __device__ inline fptype* + idcoupAccessBuffer( fptype* buffer, // input "super-buffer" + const int idcoup ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + constexpr int ix2 = 0; + // NB! this effectively adds an offset "idcoup * nx2 * neppC" + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* idcoupAccessBufferConst( const fptype* buffer, const int idcoup ) <===] + // NB: keep this in public even if exposed through KernelAccessCouplings: nvcc says it is inaccesible otherwise? + static __host__ __device__ inline const fptype* + idcoupAccessBufferConst( const fptype* buffer, // input "super-buffer" + const int idcoup ) + { + return idcoupAccessBuffer( const_cast( buffer ), idcoup ); + } + + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of couplings that dependent on the running alphas QCD in this specific process + static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static 
__host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagC = ievt / neppC; // #event "C-page" + const int ieppC = ievt % neppC; // #event in the current event C-page + constexpr int idcoup = 0; + constexpr int ix2 = 0; + return &( buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC] ); // AOSOA[ipagC][idcoup][ix2][ieppC] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int ix2" and rename "Field" as "Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ix2 ) + { + constexpr int ipagC = 0; + constexpr int ieppC = 0; + // NB! the offset "idcoup * nx2 * neppC" has been added in idcoupAccessBuffer + constexpr int idcoup = 0; + return buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC]; // AOSOA[ipagC][idcoup][ix2][ieppC] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessCouplings : public MemoryAccessCouplingsBase { - constexpr int ipagC = 0; - constexpr int ieppC = 0; - // NB! 
the offset "idcoup * nx2 * neppC" has been added in idcoupAccessBuffer - constexpr int idcoup = 0; - return buffer[ipagC * ndcoup * nx2 * neppC + idcoup * nx2 * neppC + ix2 * neppC + ieppC]; // AOSOA[ipagC][idcoup][ix2][ieppC] - } -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessCouplings : public MemoryAccessCouplingsBase -{ -public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] - static constexpr auto decodeRecordIx2Const = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, 
const int ix2 ) <===] - static constexpr auto ieventAccessIx2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] - static constexpr auto ieventAccessIx2Const = - MemoryAccessHelper::template ieventAccessFieldConst; -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessCouplings -{ -public: - - // Expose selected functions from MemoryAccessCouplingsBase - static constexpr auto idcoupAccessBuffer = MemoryAccessCouplingsBase::idcoupAccessBuffer; - static constexpr auto idcoupAccessBufferConst = MemoryAccessCouplingsBase::idcoupAccessBufferConst; - - // Expose selected functions from MemoryAccessCouplings - static constexpr auto ieventAccessRecordConst = MemoryAccessCouplings::ieventAccessRecordConst; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2_s = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static constexpr auto kernelAccessIx2Const_s = - KernelAccessHelper::template kernelAccessFieldConst; - - // Locate a field (output) in a memory 
buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] - static __host__ __device__ inline fptype_sv& - kernelAccessIx2( fptype* buffer, - const int ix2 ) + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2 = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ix2 ) <===] + static constexpr auto decodeRecordIx2Const = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIx2( fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2 = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& 
ieventAccessIx2Const( const fptype* buffer, const ievt, const int ix2 ) <===] + static constexpr auto ieventAccessIx2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessCouplings { - fptype& out = kernelAccessIx2_s( buffer, ix2 ); + public: + + // Expose selected functions from MemoryAccessCouplingsBase + static constexpr auto idcoupAccessBuffer = MemoryAccessCouplingsBase::idcoupAccessBuffer; + static constexpr auto idcoupAccessBufferConst = MemoryAccessCouplingsBase::idcoupAccessBufferConst; + + // Expose selected functions from MemoryAccessCouplings + static constexpr auto ieventAccessRecordConst = MemoryAccessCouplings::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2_s = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static constexpr auto kernelAccessIx2Const_s = + KernelAccessHelper::template kernelAccessFieldConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccessIx2( fptype* buffer, const int ix2 ) <===] + static 
__host__ __device__ inline fptype_sv& + kernelAccessIx2( fptype* buffer, + const int ix2 ) + { + fptype& out = kernelAccessIx2_s( buffer, ix2 ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else - // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays - constexpr int neppC = MemoryAccessCouplingsBase::neppC; - static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS - static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS - static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast #endif - } + } - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static __host__ __device__ inline const fptype_sv& - kernelAccessIx2Const( const fptype* buffer, - const int ix2 ) - { - return kernelAccessIx2( const_cast( buffer ), ix2 ); - } - - /* - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] - static __host__ __device__ inline const fptype_sv& - kernelAccessIx2Const( const fptype* buffer, - const int ix2 ) - { - const fptype& out = kernelAccessIx2Const_s( buffer, ix2 ); + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + return kernelAccessIx2( const_cast( buffer ), ix2 ); + } + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccessIx2Const( const fptype* buffer, const int ix2 ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessIx2Const( const fptype* buffer, + const int ix2 ) + { + const fptype& out = kernelAccessIx2Const_s( buffer, ix2 ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else 
- // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays - constexpr int neppC = MemoryAccessCouplingsBase::neppC; - static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS - static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS - static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays + constexpr int neppC = MemoryAccessCouplingsBase::neppC; + static_assert( neppC >= neppV ); // ASSUME CONTIGUOUS ARRAYS + static_assert( neppC % neppV == 0 ); // ASSUME CONTIGUOUS ARRAYS + static_assert( mg5amcCpu::HostBufferCouplings::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast #endif - } - */ - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non const, SCALAR OR VECTOR) ===> cxtype_sv_ref kernelAccess( fptype* buffer ) <===] - static __host__ __device__ inline cxtype_sv_ref - kernelAccess( fptype* buffer ) - { - /* - fptype_sv& real = kernelAccessIx2( buffer, 0 ); - fptype_sv& imag = kernelAccessIx2( buffer, 1 ); - printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); - return cxtype_sv_ref( real, imag ); - */ - return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), - kernelAccessIx2( buffer, 1 ) ); - } - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] - static __host__ __device__ inline cxtype_sv - kernelAccessConst( const fptype* buffer ) - { - /* - const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); - const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); - printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); - return cxtype_sv( real, imag ); + } */ - return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), - kernelAccessIx2Const( buffer, 1 ) ); - } -}; - -//---------------------------------------------------------------------------- - -typedef KernelAccessCouplings HostAccessCouplings; -typedef KernelAccessCouplings DeviceAccessCouplings; -//---------------------------------------------------------------------------- + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non const, SCALAR OR VECTOR) ===> cxtype_sv_ref kernelAccess( fptype* buffer ) <===] + 
static __host__ __device__ inline cxtype_sv_ref + kernelAccess( fptype* buffer ) + { + /* + fptype_sv& real = kernelAccessIx2( buffer, 0 ); + fptype_sv& imag = kernelAccessIx2( buffer, 1 ); + printf( "C_ACCESS::kernelAccess: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv_ref( real, imag ); + */ + return cxtype_sv_ref( kernelAccessIx2( buffer, 0 ), + kernelAccessIx2( buffer, 1 ) ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + /* + const fptype_sv& real = kernelAccessIx2Const( buffer, 0 ); + const fptype_sv& imag = kernelAccessIx2Const( buffer, 1 ); + printf( "C_ACCESS::kernelAccessConst: pbuffer=%p pr=%p pi=%p\n", buffer, &real, &imag ); + return cxtype_sv( real, imag ); + */ + return cxtype_sv( kernelAccessIx2Const( buffer, 0 ), + kernelAccessIx2Const( buffer, 1 ) ); + } + }; + + //---------------------------------------------------------------------------- + + typedef KernelAccessCouplings HostAccessCouplings; + typedef KernelAccessCouplings DeviceAccessCouplings; + + //---------------------------------------------------------------------------- + +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessCouplings_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index 0f9850baf2..ffcdf4dbef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Apr 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryAccessCouplingsFixed_H #define MemoryAccessCouplingsFixed_H 1 @@ -8,63 +13,72 @@ //#include "MemoryAccessHelpers.h" -//---------------------------------------------------------------------------- - -// A class describing the internal layout of memory buffers for fixed couplings -// This implementation uses a STRUCT[ndcoup][nx2] "super-buffer" layout: in practice, the cIPC global array -// From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling -// [If many implementations are used, a suffix _Sv1 should be appended to the class name] -class MemoryAccessCouplingsFixedBase //_Sv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: + //---------------------------------------------------------------------------- - // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) - // [Signature (const) ===> const fptype* iicoupAccessBufferConst( const fptype* buffer, const int iicoup ) <===] - static __host__ __device__ inline const fptype* - iicoupAccessBufferConst( const fptype* buffer, // input "super-buffer": in practice, the cIPC global array - const int iicoup ) + // A class describing the internal layout of memory buffers for fixed couplings + // This implementation uses a STRUCT[ndcoup][nx2] "super-buffer" layout: in practice, the cIPC global array + // From the "super-buffer" for ndcoup different couplings, use idcoupAccessBuffer to access the buffer for one specific coupling + // [If many implementations are used, a suffix _Sv1 should be appended to the class name] + class MemoryAccessCouplingsFixedBase //_Sv1 { - 
constexpr int ix2 = 0; - // NB! this effectively adds an offset "iicoup * nx2" - return &( buffer[iicoup * nx2 + ix2] ); // STRUCT[idcoup][ix2] - } - -private: - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; -}; - -//---------------------------------------------------------------------------- + public: + + // Locate the buffer for a single coupling (output) in a memory super-buffer (input) from the given coupling index (input) + // [Signature (const) ===> const fptype* iicoupAccessBufferConst( const fptype* buffer, const int iicoup ) <===] + static __host__ __device__ inline const fptype* + iicoupAccessBufferConst( const fptype* buffer, // input "super-buffer": in practice, the cIPC global array + const int iicoup ) + { + constexpr int ix2 = 0; + // NB! this effectively adds an offset "iicoup * nx2" + return &( buffer[iicoup * nx2 + ix2] ); // STRUCT[idcoup][ix2] + } + + private: + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessCouplingsFixed + { + public: -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessCouplingsFixed -{ -public: + // Expose selected functions from MemoryAccessCouplingsFixedBase + static constexpr auto iicoupAccessBufferConst = MemoryAccessCouplingsFixedBase::iicoupAccessBufferConst; - // Expose selected functions from MemoryAccessCouplingsFixedBase - static constexpr auto 
iicoupAccessBufferConst = MemoryAccessCouplingsFixedBase::iicoupAccessBufferConst; + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] + static __host__ __device__ inline const cxtype_sv + kernelAccessConst( const fptype* buffer ) + { + // TRIVIAL ACCESS to fixed-couplings buffers! + //return cxmake( fptype_sv{ buffer[0] }, fptype_sv{ buffer[1] } ); // NO! BUG #339! + const fptype_sv r_sv = fptype_sv{ 0 } + buffer[0]; + const fptype_sv i_sv = fptype_sv{ 0 } + buffer[1]; + return cxmake( r_sv, i_sv ); // ugly but effective + } + }; - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR OR VECTOR) ===> cxtype_sv kernelAccessConst( const fptype* buffer ) <===] - static __host__ __device__ inline const cxtype_sv - kernelAccessConst( const fptype* buffer ) - { - // TRIVIAL ACCESS to fixed-couplings buffers! - //return cxmake( fptype_sv{ buffer[0] }, fptype_sv{ buffer[1] } ); // NO! BUG #339! 
- const fptype_sv r_sv = fptype_sv{ 0 } + buffer[0]; - const fptype_sv i_sv = fptype_sv{ 0 } + buffer[1]; - return cxmake( r_sv, i_sv ); // ugly but effective - } -}; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessCouplingsFixed HostAccessCouplingsFixed; + typedef KernelAccessCouplingsFixed DeviceAccessCouplingsFixed; -typedef KernelAccessCouplingsFixed HostAccessCouplingsFixed; -typedef KernelAccessCouplingsFixed DeviceAccessCouplingsFixed; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessCouplingsFixed_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 7a4a80ebd9..66f2d32a6b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -1,18 +1,32 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessDenominators_H #define MemoryAccessDenominators_H 1 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessGs.h" -//---------------------------------------------------------------------------- +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for denominators + // This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs -// A class describing the internal layout of memory buffers for denominators -// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + typedef KernelAccessGs HostAccessDenominators; + typedef KernelAccessGs DeviceAccessDenominators; -typedef KernelAccessGs HostAccessDenominators; -typedef KernelAccessGs DeviceAccessDenominators; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif #endif // MemoryAccessDenominators_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h index f233d64b9c..4c726b30f3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessGs_H #define MemoryAccessGs_H 1 @@ -7,142 +12,151 @@ #include "MemoryAccessVectors.h" #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned -//---------------------------------------------------------------------------- - -// A class describing the internal layout of memory buffers for Gs -// This implementation uses a plain ARRAY[nevt] -// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] -class MemoryAccessGsBase //_ARRAYv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - return &( buffer[ievt] ); // ARRAY[nevt] - } + //---------------------------------------------------------------------------- - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer ) + // A class describing the internal layout of memory buffers for Gs + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessGsBase //_ARRAYv1 { - constexpr int ievt = 0; - return buffer[ievt]; // ARRAY[nevt] - } -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessGs : public MemoryAccessGsBase -{ -public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] - static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] - static constexpr auto decodeRecordConst = - MemoryAccessHelper::template decodeRecordConst<>; - - // Locate a field (output) in a memory buffer 
(input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] - static constexpr auto ieventAccess = - MemoryAccessHelper::template ieventAccessField<>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] - static constexpr auto ieventAccessConst = - MemoryAccessHelper::template ieventAccessFieldConst<>; -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessGs -{ -public: - - // Expose selected functions from MemoryAccessGs - static constexpr auto ieventAccessRecord = MemoryAccessGs::ieventAccessRecord; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const, SCALAR) ===> fptype& kernelAccess( fptype* buffer ) <===] - static constexpr auto kernelAccess_s = - KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) - // [Signature (non-const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( fptype* buffer ) <===] - static __host__ __device__ inline fptype_sv& - kernelAccess( fptype* buffer ) + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as 
"accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessGs : public MemoryAccessGsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto 
ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessGs { - fptype& out = kernelAccess_s( buffer ); + public: + + // Expose selected functions from MemoryAccessGs + static constexpr auto ieventAccessRecord = MemoryAccessGs::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess( fptype* 
buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non-const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else - // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) - static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
+ return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast #endif - } - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] - static constexpr auto kernelAccessConst_s = - KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) - // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccess( const fptype* buffer ) <===] - static __host__ __device__ inline const fptype_sv& - kernelAccessConst( const fptype* buffer ) - { - const fptype& out = kernelAccessConst_s( buffer ); + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst_s = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (const, SCALAR OR VECTOR) ===> const fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline const fptype_sv& + kernelAccessConst( const fptype* buffer ) + { + const fptype& out = kernelAccessConst_s( buffer ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else - // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) - static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
- //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferGs::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast #endif - } -}; + } + }; + + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessGs HostAccessGs; + typedef KernelAccessGs DeviceAccessGs; -typedef KernelAccessGs HostAccessGs; -typedef KernelAccessGs DeviceAccessGs; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessGs_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index aa3016c9a1..db73e4e064 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -100,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index 05f0810807..3741011971 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessMatrixElements_H #define MemoryAccessMatrixElements_H 1 @@ -7,126 +12,135 @@ #include "MemoryAccessVectors.h" #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned -//---------------------------------------------------------------------------- - -// A class describing the internal layout of memory buffers for matrix elements -// This implementation uses a plain ARRAY[nevt] -// [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] -class MemoryAccessMatrixElementsBase //_ARRAYv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - return &( buffer[ievt] ); // ARRAY[nevt] - } - - //-------------------------------------------------------------------------- + //---------------------------------------------------------------------------- - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... 
args ) <===] - // [NB: expand variadic template "Ts... args" to empty and rename "Field" as empty] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer ) + // A class describing the internal layout of memory buffers for matrix elements + // This implementation uses a plain ARRAY[nevt] + // [If many implementations are used, a suffix _ARRAYv1 should be appended to the class name] + class MemoryAccessMatrixElementsBase //_ARRAYv1 { - constexpr int ievt = 0; - return buffer[ievt]; // ARRAY[nevt] - } -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessMatrixElements : public MemoryAccessMatrixElementsBase -{ -public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] - static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] - static constexpr auto decodeRecordConst = - 
MemoryAccessHelper::template decodeRecordConst<>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] - static constexpr auto ieventAccess = - MemoryAccessHelper::template ieventAccessField<>; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] - static constexpr auto ieventAccessConst = - MemoryAccessHelper::template ieventAccessFieldConst<>; -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessMatrixElements -{ -public: - - // Expose selected functions from MemoryAccessMatrixElements - static constexpr auto ieventAccessRecord = MemoryAccessMatrixElements::ieventAccessRecord; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const, SCALAR) ===> fptype& kernelAccess_s( fptype* buffer ) <===] - static constexpr auto kernelAccess_s = - KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) - // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( const fptype* buffer ) <===] - static __host__ __device__ inline fptype_sv& - kernelAccess( fptype* buffer ) + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + 
//-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + return &( buffer[ievt] ); // ARRAY[nevt] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to empty and rename "Field" as empty] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer ) + { + constexpr int ievt = 0; + return buffer[ievt]; // ARRAY[nevt] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessMatrixElements : public MemoryAccessMatrixElementsBase + { + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer ) <===] + static constexpr auto decodeRecord = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer ) <===] + static constexpr auto decodeRecordConst = + MemoryAccessHelper::template decodeRecordConst<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccess( fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccess = + 
MemoryAccessHelper::template ieventAccessField<>; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessConst( const fptype* buffer, const ievt ) <===] + static constexpr auto ieventAccessConst = + MemoryAccessHelper::template ieventAccessFieldConst<>; + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessMatrixElements { - fptype& out = kernelAccess_s( buffer ); + public: + + // Expose selected functions from MemoryAccessMatrixElements + static constexpr auto ieventAccessRecord = MemoryAccessMatrixElements::ieventAccessRecord; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccess_s( fptype* buffer ) <===] + static constexpr auto kernelAccess_s = + KernelAccessHelper::template kernelAccessField<>; // requires cuda 11.4 + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) + // [Signature (non const, SCALAR OR VECTOR) ===> fptype_sv& kernelAccess( const fptype* buffer ) <===] + static __host__ __device__ inline fptype_sv& + kernelAccess( fptype* buffer ) + { + fptype& out = kernelAccess_s( buffer ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else - // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) - static_assert( mg5amcCpu::HostBufferMatrixElements::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) 
- //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + // NB: derived from MemoryAccessMomenta, restricting the implementation to contiguous aligned arrays (#435) + static_assert( mg5amcCpu::HostBufferMatrixElements::isaligned() ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + //assert( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ); // ASSUME ALIGNED ARRAYS (reinterpret_cast will segfault otherwise!) + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast #endif - } + } + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] + static constexpr auto kernelAccessConst = + KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 + }; - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessConst( const fptype* buffer ) <===] - static constexpr auto kernelAccessConst = - KernelAccessHelper::template kernelAccessFieldConst<>; // requires cuda 11.4 -}; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessMatrixElements HostAccessMatrixElements; + typedef KernelAccessMatrixElements DeviceAccessMatrixElements; -typedef KernelAccessMatrixElements HostAccessMatrixElements; -typedef KernelAccessMatrixElements DeviceAccessMatrixElements; + //---------------------------------------------------------------------------- 
-//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessMatrixElements_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index ace50b40e8..3be229d392 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,260 +1,275 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" #include "MemoryAccessHelpers.h" #include "MemoryAccessVectors.h" -//---------------------------------------------------------------------------- - -// A class describing the internal layout of memory buffers for momenta -// This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM -// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] -class MemoryAccessMomentaBase //_AOSOAv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: - - // Number of Events Per Page in the momenta AOSOA memory buffer layout - // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ - // ----------------------------------------------------------------------------------------------- - // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline - // --- This is relevant to 
ensure coalesced access to momenta in global memory - // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms - // ----------------------------------------------------------------------------------------------- - //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) - static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for momenta + // This implementation uses an AOSOA[npagM][npar][np4][neppM] where nevt=npagM*neppM + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessMomentaBase //_AOSOAv1 + { + public: + + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** 
(slower: 1.03E9 instead of 1.11E9 in eemumu) #else - // ----------------------------------------------------------------------------------------------- - // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register - // --- This is relevant to ensure faster access to momenta from C++ memory cache lines - // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded - // --- In practice, neppR, neppM and neppV could now (in principle) all be different - // ----------------------------------------------------------------------------------------------- + // ----------------------------------------------------------------------------------------------- + // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register + // --- This is relevant to ensure faster access to momenta from C++ memory cache lines + // --- However, neppM is now decoupled from neppV (issue #176) and can be separately hardcoded + // --- In practice, neppR, neppM and neppV could now (in principle) all be different + // ----------------------------------------------------------------------------------------------- #ifdef MGONGPU_CPPSIMD - static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance - //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) - //static constexpr int neppM = 32/sizeof(fptype); // lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) - //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS + static constexpr int neppM = MGONGPU_CPPSIMD; // (DEFAULT) neppM=neppV for optimal performance + //static constexpr int neppM = 64/sizeof(fptype); // maximum CPU vector width (512 bits): 8 (DOUBLE) or 16 (FLOAT) + //static constexpr int neppM = 32/sizeof(fptype); // 
lower CPU vector width (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 4.66E6 instead of 5.09E9 in eemumu) + //static constexpr int neppM = MGONGPU_CPPSIMD*2; // FOR TESTS #else - static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) + static constexpr int neppM = 1; // (DEFAULT) neppM=neppV for optimal performance (NB: this is equivalent to AOS) #endif #endif /* clang-format on */ - // SANITY CHECK: check that neppM is a power of two - static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); + // SANITY CHECK: check that neppM is a power of two + static_assert( ispoweroftwo( neppM ), "neppM is not a power of 2" ); -private: + private: - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; - // The number of components of a 4-momentum - static constexpr int np4 = mgOnGpu::np4; + // The number of components of a 4-momentum + static constexpr int np4 = CPPProcess::np4; - // The number of particles in this physics process - static constexpr int npar = mgOnGpu::npar; + // The number of particles in this physics process + static constexpr int npar = CPPProcess::npar; - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first 
locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) - { - const int ipagM = ievt / neppM; // #event "M-page" - const int ieppM = ievt % neppM; // #event in the current event M-page - constexpr int ip4 = 0; - constexpr int ipar = 0; - return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... 
args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int ip4, - const int ipar ) + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagM = ievt / neppM; // #event "M-page" + const int ieppM = ievt % neppM; // #event in the current event M-page + constexpr int ip4 = 0; + constexpr int ipar = 0; + return &( buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM] ); // AOSOA[ipagM][ipar][ip4][ieppM] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, Ts... args ) <===] + // [NB: expand variadic template "Ts... 
args" to "const int ip4, const int ipar" and rename "Field" as "Ip4Ipar"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int ip4, + const int ipar ) + { + constexpr int ipagM = 0; + constexpr int ieppM = 0; + return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessMomenta : public MemoryAccessMomentaBase { - constexpr int ipagM = 0; - constexpr int ieppM = 0; - return buffer[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM]; // AOSOA[ipagM][ipar][ip4][ieppM] - } -}; + public: + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* 
buffer, const int ipar, const int ipar ) <===] + static constexpr auto decodeRecordIp4IparConst = + MemoryAccessHelper::template decodeRecordConst; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + static constexpr auto ieventAccessIp4Ipar = + MemoryAccessHelper::template ieventAccessField; + + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto ieventAccessIp4IparConst = + MemoryAccessHelper::template ieventAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + ieventAccessIp4IparConst( const fptype* buffer, + const int ievt, + const int ip4, + const int ipar ) + { + const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); + return out; + } + */ + }; -//---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessMomenta : public 
MemoryAccessMomentaBase -{ -public: - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int ipar, const int ipar ) <===] - static constexpr auto decodeRecordIp4Ipar = MemoryAccessHelper::decodeRecord; - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int ipar, const int ipar ) <===] - static constexpr auto decodeRecordIp4IparConst = - MemoryAccessHelper::template decodeRecordConst; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIp4Ipar( fptype* buffer, const ievt, const int ipar, const int ipar ) <===] - static constexpr auto ieventAccessIp4Ipar = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] - // DEFAULT VERSION - static constexpr auto ieventAccessIp4IparConst = - MemoryAccessHelper::template ieventAccessFieldConst; - - /* - 
// Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIp4IparConst( const fptype* buffer, const ievt, const int ipar, const int ipar ) <===] - // DEBUG VERSION WITH PRINTOUTS - static __host__ __device__ inline const fptype& - ieventAccessIp4IparConst( const fptype* buffer, - const int ievt, - const int ip4, - const int ipar ) + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessMomenta { - const fptype& out = MemoryAccessHelper::template ieventAccessFieldConst( buffer, ievt, ip4, ipar ); - printf( "ipar=%2d ip4=%2d ievt=%8d out=%8.3f\n", ipar, ip4, ievt, out ); - return out; - } - */ -}; - -//---------------------------------------------------------------------------- - -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessMomenta -{ -public: - - // Expose selected functions from MemoryAccessMomenta - static constexpr auto ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ipar, const int ipar ) <===] - static constexpr auto kernelAccessIp4Ipar = - KernelAccessHelper::template kernelAccessField; - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR) ===> const fptype& 
kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] - // DEFAULT VERSION - static constexpr auto kernelAccessIp4IparConst_s = - KernelAccessHelper::template kernelAccessFieldConst; - - /* - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] - // DEBUG VERSION WITH PRINTOUTS - static __host__ __device__ inline const fptype& - kernelAccessIp4IparConst_s( const fptype* buffer, + public: + + // Expose selected functions from MemoryAccessMomenta + static constexpr auto ieventAccessRecordConst = MemoryAccessMomenta::ieventAccessRecordConst; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const, SCALAR) ===> fptype& kernelAccessIp4Ipar( fptype* buffer, const int ipar, const int ipar ) <===] + static constexpr auto kernelAccessIp4Ipar = + KernelAccessHelper::template kernelAccessField; + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEFAULT VERSION + static constexpr auto kernelAccessIp4IparConst_s = + KernelAccessHelper::template kernelAccessFieldConst; + + /* + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR) ===> const fptype& kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // DEBUG VERSION WITH PRINTOUTS + static __host__ __device__ inline const fptype& + kernelAccessIp4IparConst_s( const fptype* 
buffer, + const int ip4, + const int ipar ) + { + const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); + printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); + return out; + } + */ + + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] + // FIXME? Eventually return by const reference and support aligned arrays only? + // FIXME? Currently return by value to support also unaligned and arbitrary arrays + static __host__ __device__ inline fptype_sv + kernelAccessIp4IparConst( const fptype* buffer, const int ip4, const int ipar ) - { - const fptype& out = KernelAccessHelper::template kernelAccessFieldConst( buffer, ip4, ipar ); - printf( "ipar=%2d ip4=%2d ievt='kernel' out=%8.3f\n", ipar, ip4, out ); - return out; - } - */ - - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const, SCALAR OR VECTOR) ===> fptype_sv kernelAccessIp4IparConst( const fptype* buffer, const int ipar, const int ipar ) <===] - // FIXME? Eventually return by const reference and support aligned arrays only? - // FIXME? 
Currently return by value to support also unaligned and arbitrary arrays - static __host__ __device__ inline fptype_sv - kernelAccessIp4IparConst( const fptype* buffer, - const int ip4, - const int ipar ) - { - const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); + { + const fptype& out = kernelAccessIp4IparConst_s( buffer, ip4, ipar ); #ifndef MGONGPU_CPPSIMD - return out; + return out; #else - constexpr int neppM = MemoryAccessMomentaBase::neppM; - constexpr bool useContiguousEventsIfPossible = true; // DEFAULT - //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) - // Use c++17 "if constexpr": compile-time branching - if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) - { - //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) - constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] - if constexpr( skipAlignmentCheck ) - { - //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) - // FASTEST? (5.09E6 in eemumu 512y) - // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! 
- return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast - } - else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + constexpr int neppM = MemoryAccessMomentaBase::neppM; + constexpr bool useContiguousEventsIfPossible = true; // DEFAULT + //constexpr bool useContiguousEventsIfPossible = false; // FOR PERFORMANCE TESTS (treat as arbitrary array even if it is an AOSOA) + // Use c++17 "if constexpr": compile-time branching + if constexpr( useContiguousEventsIfPossible && ( neppM >= neppV ) && ( neppM % neppV == 0 ) ) { - //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) - // DEFAULT! A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) - // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast - return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + //constexpr bool skipAlignmentCheck = true; // FASTEST (SEGFAULTS IF MISALIGNED ACCESS, NEEDS A SANITY CHECK ELSEWHERE!) + constexpr bool skipAlignmentCheck = false; // DEFAULT: A BIT SLOWER BUT SAFER [ALLOWS MISALIGNED ACCESS] + if constexpr( skipAlignmentCheck ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! assume aligned AOSOA, skip check" << std::endl; first=false; } // SLOWER (5.06E6) + // FASTEST? (5.09E6 in eemumu 512y) + // This assumes alignment for momenta1d without checking - causes segmentation fault in reinterpret_cast if not aligned! + return mg5amcCpu::fptypevFromAlignedArray( out ); // use reinterpret_cast + } + else if( (size_t)( buffer ) % mgOnGpu::cppAlign == 0 ) + { + //static bool first=true; if( first ){ std::cout << "WARNING! aligned AOSOA, reinterpret cast" << std::endl; first=false; } // SLOWER (5.00E6) + // DEFAULT! 
A tiny bit (<1%) slower because of the alignment check (5.07E6 in eemumu 512y) + // This explicitly checks buffer alignment to avoid segmentation faults in reinterpret_cast + return mg5amcCpu::fptypevFromAlignedArray( out ); // SIMD bulk load of neppV, use reinterpret_cast + } + else + { + //static bool first=true; if( first ){ std::cout << "WARNING! AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) + // A bit (1%) slower (5.05E6 in eemumu 512y) + // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 + return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + } } else { - //static bool first=true; if( first ){ std::cout << "WARNING! AOSOA but no reinterpret cast" << std::endl; first=false; } // SLOWER (4.93E6) - // A bit (1%) slower (5.05E6 in eemumu 512y) - // This does not require buffer alignment, but it requires AOSOA with neppM>=neppV and neppM%neppV==0 - return mg5amcCpu::fptypevFromUnalignedArray( out ); // SIMD bulk load of neppV, do not use reinterpret_cast (fewer SIMD operations) + //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) + // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) + // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. 
can be used with AOS neppM==1) + constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV + auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) + -> const fptype& + { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; + return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) } +#endif } - else + + // Is this a HostAccess or DeviceAccess class? + // [this is only needed for a warning printout in rambo.h for nparf==1 #358] + static __host__ __device__ inline constexpr bool + isOnDevice() { - //static bool first=true; if( first ){ std::cout << "WARNING! arbitrary array" << std::endl; first=false; } // SLOWER (5.08E6) - // ?!Used to be much slower, now a tiny bit faster for AOSOA?! (5.11E6 for AOSOA, 4.64E6 for AOS in eemumu 512y) - // This does not even require AOSOA with neppM>=neppV and neppM%neppV==0 (e.g. can be used with AOS neppM==1) - constexpr int ievt0 = 0; // just make it explicit in the code that buffer refers to a given ievt0 and decoderIeppV fetches event ievt0+ieppV - auto decoderIeppv = [buffer, ip4, ipar]( int ieppV ) - -> const fptype& - { return MemoryAccessMomenta::ieventAccessIp4IparConst( buffer, ievt0 + ieppV, ip4, ipar ); }; - return mg5amcCpu::fptypevFromArbitraryArray( decoderIeppv ); // iterate over ieppV in neppV (no SIMD) + return onDevice; } -#endif - } + }; - // Is this a HostAccess or DeviceAccess class? 
- // [this is only needed for a warning printout in rambo.h for nparf==1 #358] - static __host__ __device__ inline constexpr bool - isOnDevice() - { - return onDevice; - } -}; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessMomenta HostAccessMomenta; + typedef KernelAccessMomenta DeviceAccessMomenta; -typedef KernelAccessMomenta HostAccessMomenta; -typedef KernelAccessMomenta DeviceAccessMomenta; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessMomenta_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index e5f81381a9..18991f4fa6 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -1,18 +1,32 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (May 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessNumerators_H #define MemoryAccessNumerators_H 1 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL #include "MemoryAccessGs.h" -//---------------------------------------------------------------------------- +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //---------------------------------------------------------------------------- + + // A class describing the internal layout of memory buffers for numerators + // This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs -// A class describing the internal layout of memory buffers for numerators -// This implementation reuses the plain ARRAY[nevt] implementation of MemoryAccessGs + typedef KernelAccessGs HostAccessNumerators; + typedef KernelAccessGs DeviceAccessNumerators; -typedef KernelAccessGs HostAccessNumerators; -typedef KernelAccessGs DeviceAccessNumerators; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif #endif // MemoryAccessNumerators_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index a7ff24243f..40cb089135 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,10 +1,22 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 #include "mgOnGpuConfig.h" +#include "CPPProcess.h" #include "MemoryAccessHelpers.h" +#ifdef MGONGPUCPP_GPUIMPL +using mg5amcGpu::CPPProcess; +#else +using mg5amcCpu::CPPProcess; +#endif + //---------------------------------------------------------------------------- // A class describing the internal layout of memory buffers for random numbers @@ -27,10 +39,10 @@ class MemoryAccessRandomNumbersBase //_AOSOAv1 friend class KernelAccessHelper; // The number of components of a 4-momentum - static constexpr int np4 = mgOnGpu::np4; + static constexpr int np4 = CPPProcess::np4; // The number of final state particles in this physics process - static constexpr int nparf = mgOnGpu::nparf; + static constexpr int nparf = CPPProcess::nparf; //-------------------------------------------------------------------------- // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h index 2697cdad52..08faccff0f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -5,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 738eef9a02..33bef4559e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryAccessWavefunctions_H #define MemoryAccessWavefunctions_H 1 @@ -9,147 +14,156 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 -//---------------------------------------------------------------------------- - -#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - -// A class describing the internal layout of memory buffers for wavefunctions -// This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW -// [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] -class MemoryAccessWavefunctionsBase //_AOSOAv1 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: - - // Number of Events Per Page in the wavefunction AOSOA memory buffer layout - static constexpr int neppW = 1; // AOS (just a test...) 
- -private: - - friend class MemoryAccessHelper; - friend class KernelAccessHelper; - friend class KernelAccessHelper; - - // The number of components of a (fermion or vector) wavefunction - static constexpr int nw6 = mgOnGpu::nw6; - - // The number of floating point components of a complex number - static constexpr int nx2 = mgOnGpu::nx2; + //---------------------------------------------------------------------------- - //-------------------------------------------------------------------------- - // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" - // (in other words: first locate the event record for a given event, then locate an element in that record) - //-------------------------------------------------------------------------- +#ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static __host__ __device__ inline fptype* - ieventAccessRecord( fptype* buffer, - const int ievt ) + // A class describing the internal layout of memory buffers for wavefunctions + // This implementation uses an AOSOA[npagW][nw6][nx2][neppW] where nevt=npagW*neppW + // [If many implementations are used, a suffix _AOSOAv1 should be appended to the class name] + class MemoryAccessWavefunctionsBase //_AOSOAv1 { - const int ipagW = ievt / neppW; // #event "W-page" - const int ieppW = ievt % neppW; // #event in the current event W-page - constexpr int iw6 = 0; - constexpr int ix2 = 0; - return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] - } - - //-------------------------------------------------------------------------- - - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( 
fptype* buffer, Ts... args ) <===] - // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] - static __host__ __device__ inline fptype& - decodeRecord( fptype* buffer, - const int iw6, - const int ix2 ) + public: + + // Number of Events Per Page in the wavefunction AOSOA memory buffer layout + static constexpr int neppW = 1; // AOS (just a test...) + + private: + + friend class MemoryAccessHelper; + friend class KernelAccessHelper; + friend class KernelAccessHelper; + + // The number of components of a (fermion or vector) wavefunction + static constexpr int nw6 = mgOnGpu::nw6; + + // The number of floating point components of a complex number + static constexpr int nx2 = mgOnGpu::nx2; + + //-------------------------------------------------------------------------- + // NB all KernelLaunchers assume that memory access can be decomposed as "accessField = decodeRecord( accessRecord )" + // (in other words: first locate the event record for a given event, then locate an element in that record) + //-------------------------------------------------------------------------- + + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static __host__ __device__ inline fptype* + ieventAccessRecord( fptype* buffer, + const int ievt ) + { + const int ipagW = ievt / neppW; // #event "W-page" + const int ieppW = ievt % neppW; // #event in the current event W-page + constexpr int iw6 = 0; + constexpr int ix2 = 0; + return &( buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW] ); // AOSOA[ipagW][iw6][ix2][ieppW] + } + + //-------------------------------------------------------------------------- + + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, 
Ts... args ) <===] + // [NB: expand variadic template "Ts... args" to "const int iw6, const int ix2" and rename "Field" as "Iw6Ix2"] + static __host__ __device__ inline fptype& + decodeRecord( fptype* buffer, + const int iw6, + const int ix2 ) + { + constexpr int ipagW = 0; + constexpr int ieppW = 0; + return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] + } + }; + + //---------------------------------------------------------------------------- + + // A class providing access to memory buffers for a given event, based on explicit event numbers + // Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations + class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase { - constexpr int ipagW = 0; - constexpr int ieppW = 0; - return buffer[ipagW * nw6 * nx2 * neppW + iw6 * nx2 * neppW + ix2 * neppW + ieppW]; // AOSOA[ipagW][iw6][ix2][ieppW] - } -}; - -//---------------------------------------------------------------------------- + public: -// A class providing access to memory buffers for a given event, based on explicit event numbers -// Its methods use the MemoryAccessHelper templates - note the use of the template keyword in template function instantiations -class MemoryAccessWavefunctions : public MemoryAccessWavefunctionsBase -{ -public: + // Locate an event record (output) in a memory buffer (input) from the given event number (input) + // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (non-const) ===> fptype* ieventAccessRecord( fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecord = MemoryAccessHelper::ieventAccessRecord; + // Locate an event record 
(output) in a memory buffer (input) from the given event number (input) + // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] + static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; - // Locate an event record (output) in a memory buffer (input) from the given event number (input) - // [Signature (const) ===> const fptype* ieventAccessRecordConst( const fptype* buffer, const int ievt ) <===] - static constexpr auto ieventAccessRecordConst = MemoryAccessHelper::ieventAccessRecordConst; + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (non-const) ===> fptype& decodeRecord( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2 = MemoryAccessHelper::decodeRecord; + // Locate a field (output) of an event record (input) from the given field indexes (input) + // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto decodeRecordIw6Ix2Const = + MemoryAccessHelper::template decodeRecordConst; - // Locate a field (output) of an event record (input) from the given field indexes (input) - // [Signature (const) ===> const fptype& decodeRecordConst( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto decodeRecordIw6Ix2Const = - MemoryAccessHelper::template decodeRecordConst; + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const 
int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2 = + MemoryAccessHelper::template ieventAccessField; - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (non-const) ===> fptype& ieventAccessIw6Ix2( fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2 = - MemoryAccessHelper::template ieventAccessField; - - // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) - // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] - static constexpr auto ieventAccessIw6Ix2Const = - MemoryAccessHelper::template ieventAccessFieldConst; -}; + // Locate a field (output) in a memory buffer (input) from the given event number (input) and the given field indexes (input) + // [Signature (const) ===> const fptype& ieventAccessIw6Ix2Const( const fptype* buffer, const ievt, const int iw6, const int ix2 ) <===] + static constexpr auto ieventAccessIw6Ix2Const = + MemoryAccessHelper::template ieventAccessFieldConst; + }; #endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS -//---------------------------------------------------------------------------- + //---------------------------------------------------------------------------- -// A class providing access to memory buffers for a given event, based on implicit kernel rules -// Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations -template -class KernelAccessWavefunctions -{ -public: + // A class providing access to memory buffers for a given event, based on implicit kernel rules + // Its methods use the KernelAccessHelper template - note the use of the template keyword in template function instantiations + template + class KernelAccessWavefunctions + { + 
public: #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2 = - KernelAccessHelper::template kernelAccessField; + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (non-const) ===> fptype& kernelAccessIw6Ix2( fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2 = + KernelAccessHelper::template kernelAccessField; - // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) - // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] - static constexpr auto kernelAccessIw6Ix2Const = - KernelAccessHelper::template kernelAccessFieldConst; + // Locate a field (output) in a memory buffer (input) from a kernel event-indexing mechanism (internal) and the given field indexes (input) + // [Signature (const) ===> const fptype& kernelAccessIw6Ix2Const( const fptype* buffer, const int iw6, const int ix2 ) <===] + static constexpr auto kernelAccessIw6Ix2Const = + KernelAccessHelper::template kernelAccessFieldConst; #else - static __host__ __device__ inline cxtype_sv* - kernelAccess( fptype* buffer ) - { - return reinterpret_cast( buffer ); - } + static __host__ __device__ inline cxtype_sv* + kernelAccess( fptype* buffer ) + { + return reinterpret_cast( buffer ); + } - static __host__ __device__ inline const cxtype_sv* - kernelAccessConst( const fptype* buffer ) - { - return reinterpret_cast( buffer ); - } + static __host__ __device__ inline const cxtype_sv* + kernelAccessConst( const fptype* buffer ) + { + 
return reinterpret_cast( buffer ); + } #endif // #ifndef MGONGPU_TRIVIAL_WAVEFUNCTIONS -}; + }; + + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- + typedef KernelAccessWavefunctions HostAccessWavefunctions; + typedef KernelAccessWavefunctions DeviceAccessWavefunctions; -typedef KernelAccessWavefunctions HostAccessWavefunctions; -typedef KernelAccessWavefunctions DeviceAccessWavefunctions; + //---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- +} // end namespace mg5amcGpu/mg5amcCpu #endif // MemoryAccessWavefunctions_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h index 3915657657..5dd2107ce0 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryAccessWeights.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryAccessWeights_H #define MemoryAccessWeights_H 1 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h index 5775c59793..9a62f1a3bf 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. +// Further modified by: S. 
Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -5,12 +10,13 @@ #include "mgOnGpuCxtypes.h" -#include "CudaRuntime.h" +#include "CPPProcess.h" +#include "GpuRuntime.h" #include "Parameters_MSSM_SLHA2.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -18,14 +24,15 @@ namespace mg5amcCpu { //-------------------------------------------------------------------------- - // TEMPORARY? Take this from a PhysicsProcess class? Define them here directly in codegen? namespace MemoryBuffers { - static constexpr size_t np4 = mgOnGpu::np4; - static constexpr size_t nparf = mgOnGpu::nparf; - static constexpr size_t npar = mgOnGpu::npar; - static constexpr size_t nw6 = mgOnGpu::nw6; + // Process-independent compile-time constants + static constexpr size_t np4 = CPPProcess::np4; + static constexpr size_t nw6 = CPPProcess::nw6; static constexpr size_t nx2 = mgOnGpu::nx2; + // Process-dependent compile-time constants + static constexpr size_t nparf = CPPProcess::nparf; + static constexpr size_t npar = CPPProcess::npar; static constexpr size_t ndcoup = Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; } @@ -80,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -112,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -121,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( 
this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -141,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -168,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -184,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -206,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef 
HostBuffer HostBufferRndNumMomenta; #else @@ -225,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -250,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -269,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -289,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -308,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -326,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; 
// TEST MISALIGNMENT! @@ -345,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -363,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -378,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -396,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -414,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -432,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef 
HostBuffer HostBufferRndNumColor; #else @@ -450,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -468,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -480,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -497,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -520,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt index 4ac6c179d3..a8ef043c95 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + get_filename_component(basename ${CMAKE_CURRENT_SOURCE_DIR} NAME) string(TOLOWER ${basename} targadd) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index a32c83489a..909f063728 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -1,6 +1,13 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// MadGraph5_aMC@NLO v. 
3.5.3_lo_vect, 2023-12-23 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -9,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_MSSM_SLHA2.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -39,18 +45,18 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { - using mgOnGpu::np4; // dimensions of 4-momenta (E,px,py,pz) - using mgOnGpu::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - using mgOnGpu::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + constexpr int ncomb = CPPProcess::ncomb; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - using mgOnGpu::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) - using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] + //using CPPProcess::nwf; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) using Parameters_MSSM_SLHA2_dependentCouplings::ndcoup; // #couplings that vary event by event (depend on running alphas QCD) using Parameters_MSSM_SLHA2_independentCouplings::nicoup; // #couplings that are fixed for all events (do not depend on running alphas QCD) @@ -73,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -83,13 +89,13 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; - __device__ __constant__ int cNGoodHel; // FIXME: assume process.nprocesses == 1 for the moment (eventually cNGoodHel[nprocesses]?) + __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; #else static short cHel[ncomb][npar]; - static int cNGoodHel; // FIXME: assume process.nprocesses == 1 for the moment (eventually cNGoodHel[nprocesses]?) 
+ static int cNGoodHel; static int cGoodHel[ncomb]; #endif @@ -111,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -144,10 +150,14 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif + // The variable nwf (which is specific to each P1 subdirectory, #644) is only used here + // It is hardcoded here because various attempts to hardcode it in CPPProcess.h at generation time gave the wrong result... + static const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 
5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + // Local TEMPORARY variables for a subset of Feynman diagrams in the given CUDA event (ievt) or C++ event page (ipagV) // [NB these variables are reused several times (and re-initialised each time) within the same event or event page] // ** NB: in other words, amplitudes and wavefunctions still have TRIVIAL ACCESS: there is currently no need @@ -176,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -189,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -235,10 +247,10 @@ namespace mg5amcCpu ixxxxx( momenta, cIPD[0], cHel[ihel][3], -1, w_fp[3], 3 ); - VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 0., 0., w_fp[4] ); + VVV1P0_1( w_fp[0], w_fp[1], COUPs[0], 1.0, 0., 0., w_fp[4] ); // Amplitude(s) for diagram number 1 - FFV1_0( w_fp[3], w_fp[2], w_fp[4], -COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[2], w_fp[4], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -248,10 +260,10 @@ namespace mg5amcCpu // *** DIAGRAM 2 OF 3 *** // Wavefunction(s) for diagram number 2 - FFV1_1( w_fp[2], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_1( w_fp[2], 
w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 2 - FFV1_0( w_fp[3], w_fp[4], w_fp[1], -COUPs[1], &_fp[0] ); + FFV1_0( w_fp[3], w_fp[4], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -260,10 +272,10 @@ namespace mg5amcCpu // *** DIAGRAM 3 OF 3 *** // Wavefunction(s) for diagram number 3 - FFV1_2( w_fp[3], w_fp[0], -COUPs[1], cIPD[0], cIPD[1], w_fp[4] ); + FFV1_2( w_fp[3], w_fp[0], COUPs[1], -1.0, cIPD[0], cIPD[1], w_fp[4] ); // Amplitude(s) for diagram number 3 - FFV1_0( w_fp[4], w_fp[2], w_fp[1], -COUPs[1], &_fp[0] ); + FFV1_0( w_fp[4], w_fp[2], w_fp[1], COUPs[1], -1.0, &_fp[0] ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Here the code base generated with multichannel support updates numerators_sv and denominators_sv (#473) #endif @@ -288,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -345,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -397,7 +409,6 @@ namespace mg5amcCpu // *** STORE THE RESULTS *** // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) - // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) 
fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); MEs_sv += deltaMEs; // fix #435 #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -405,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -435,7 +446,7 @@ namespace mg5amcCpu { // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** - static constexpr short tHel[ncomb][mgOnGpu::npar] = { + static constexpr short tHel[ncomb][npar] = { { -1, -1, -1, 1 }, { -1, -1, -1, -1 }, { -1, -1, 1, 1 }, @@ -452,10 +463,10 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else - memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof( short ) ); + memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif } @@ -493,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -531,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,6 +571,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -573,6 +588,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -596,12 +613,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -622,7 +639,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -632,13 +649,12 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif bool* isGoodHel ) // output: isGoodHel[ncomb] - device array (CUDA implementation) - { /* clang-format on */ - // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) 
- fptype allMEsLast = 0; + { /* clang-format on */ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid - allMEs[ievt] = 0; for( int ihel = 0; ihel < ncomb; ihel++ ) { + // NEW IMPLEMENTATION OF GETGOODHEL (#630): RESET THE RUNNING SUM OVER HELICITIES TO 0 BEFORE ADDING A NEW HELICITY + allMEs[ievt] = 0; // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering #ifdef MGONGPU_SUPPORTS_MULTICHANNEL @@ -647,12 +663,11 @@ namespace mg5amcCpu #else calculate_wavefunctions( ihel, allmomenta, allcouplings, allMEs, jamp2_sv ); #endif - if( allMEs[ievt] != allMEsLast ) + if( allMEs[ievt] != 0 ) // NEW IMPLEMENTATION OF GETGOODHEL (#630): COMPARE EACH HELICITY CONTRIBUTION TO 0 { //if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl; isGoodHel[ihel] = true; } - allMEsLast = allMEs[ievt]; // running sum up to helicity ihel for event ievt } } #else @@ -671,20 +686,11 @@ namespace mg5amcCpu //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS] // Allocate arrays at build time to contain at least 16 events (or at least neppV events if neppV>16, e.g. in future VPUs) constexpr int maxtry0 = std::max( 16, neppV ); // 16, but at least neppV (otherwise the npagV loop does not even start) - fptype allMEsLast[maxtry0] = { 0 }; // allocated at build time: maxtry0 must be a constexpr // Loop over only nevt events if nevt is < 16 (note that nevt is always >= neppV) assert( nevt >= neppV ); const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt allMEs[nevt*nprocesses]?) 
__global__ void /* clang-format off */ sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] @@ -782,19 +791,27 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { mgDebugInitialise(); + // SANITY CHECKS for cudacpp code generation (see issues #272 and #343 and PRs #619, #626, #360, #396 and #754) + // These variable are not used anywhere else in the code and their scope is limited to this sanity check + { + // nprocesses == 2 may happen for "mirror processes" such as P0_uux_ttx within pp_tt012j (see PR #754) + constexpr int nprocesses = 1; + static_assert( nprocesses == 1 || nprocesses == 2, "Assume nprocesses == 1 or 2" ); + constexpr int process_id = 1; // code generation source: standalone_cudacpp + static_assert( process_id == 1, "Assume process_id == 1" ); + } + // Denominators: spins, colors and identical particles - constexpr int nprocesses = 1; - static_assert( nprocesses == 1, "Assume nprocesses == 1" ); // FIXME (#343): assume nprocesses == 1 - constexpr int helcolDenominators[1] = { 256 }; + constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -808,10 +825,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event - // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -838,9 +857,8 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) - // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here#ifdef __CUDACC__ -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -871,23 +889,26 @@ namespace mg5amcCpu } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // Event-by-event random choice of color #402 - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 - fptype targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp[icolC] = 0; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( icolC == 0 ) + targetamp[icolC] = 0; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( 
allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ) ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } } #endif @@ -982,57 +1003,60 @@ namespace mg5amcCpu #endif } #ifdef MGONGPU_SUPPORTS_MULTICHANNEL // multichannel enabled (random color choice) - const int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 // Event-by-event random choice of color #402 - fptype_sv targetamp[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) + if( channelId != 0 ) // no event-by-event choice of color if channelId == 0 (fix FPE #783) { - if( icolC == 0 ) - targetamp[icolC] = fptype_sv{ 0 }; - else - targetamp[icolC] = targetamp[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; - } + const unsigned int channelIdC = channelId - 1; // coloramps.h uses the C array indexing starting at 0 + fptype_sv targetamp[ncolor] = { 0 }; + for( int icolC = 0; icolC < ncolor; icolC++ ) + { + if( icolC == 0 ) + targetamp[icolC] = fptype_sv{ 0 }; + else + targetamp[icolC] = targetamp[icolC - 1]; + if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp[icolC] += jamp2_sv[icolC]; + } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - fptype_sv targetamp2[ncolor] = { 0 }; - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( icolC == 0 ) - targetamp2[icolC] = fptype_sv{ 0 }; - else - targetamp2[icolC] = targetamp2[icolC - 1]; - if( mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; - } -#endif - for( int ieppV = 0; ieppV < neppV; ++ieppV ) - { - const int ievt = ievt00 + ieppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + fptype_sv targetamp2[ncolor] = { 0 }; for( int icolC = 0; icolC < ncolor; icolC++ ) { + if( icolC == 0 ) + targetamp2[icolC] = fptype_sv{ 0 }; + else + targetamp2[icolC] = targetamp2[icolC - 1]; + if( 
mgOnGpu::icolamp[channelIdC][icolC] ) targetamp2[icolC] += jamp2_sv[ncolor + icolC]; + } +#endif + for( int ieppV = 0; ieppV < neppV; ++ieppV ) + { + const int ievt = ievt00 + ieppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt, allrndcol[ievt] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) + { #if defined MGONGPU_CPPSIMD - const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC][ieppV] / targetamp[ncolor - 1][ieppV] ); #else - const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); + const bool okcol = allrndcol[ievt] < ( targetamp[icolC] / targetamp[ncolor - 1] ); #endif - if( okcol ) - { - allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( okcol ) + { + allselcol[ievt] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT - const int ievt2 = ievt00 + ieppV + neppV; - //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); - for( int icolC = 0; icolC < ncolor; icolC++ ) - { - if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + const int ievt2 = ievt00 + ieppV + neppV; + //printf( "sigmaKin: ievt=%4d rndcol=%f\n", ievt2, allrndcol[ievt2] ); + for( int icolC = 0; icolC < ncolor; icolC++ ) { - allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] - break; + if( allrndcol[ievt2] < ( targetamp2[icolC][ieppV] / targetamp2[ncolor - 1][ieppV] ) ) + { + allselcol[ievt2] = icolC + 1; // NB Fortran [1,ncolor], cudacpp [0,ncolor-1] + break; + } } - } #endif + } } #endif // multichannel enabled (random color choice) } @@ -1044,11 +1068,10 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, 
average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] - // FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?) -#ifdef __CUDACC__ - allMEs[ievt] /= helcolDenominators[0]; // FIXME (#343): assume nprocesses == 1 +#ifdef MGONGPUCPP_GPUIMPL + allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; // FIXME (#343): assume nprocesses == 1 + if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; #endif #else for( int ipagV = 0; ipagV < npagV; ++ipagV ) @@ -1056,7 +1079,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; fptype* MEs = E_ACCESS::ieventAccessRecord( allMEs, ievt0 ); fptype_sv& MEs_sv = E_ACCESS::kernelAccess( MEs ); - MEs_sv /= helcolDenominators[0]; // FIXME (#343): assume nprocesses == 1 + MEs_sv /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) { @@ -1064,7 +1087,7 @@ namespace mg5amcCpu fptype* denominators = DEN_ACCESS::ieventAccessRecord( allDenominators, ievt0 ); fptype_sv& numerators_sv = NUM_ACCESS::kernelAccess( numerators ); fptype_sv& denominators_sv = DEN_ACCESS::kernelAccess( denominators ); - MEs_sv *= numerators_sv / denominators_sv; // FIXME (#343): assume nprocesses == 1 + MEs_sv *= numerators_sv / denominators_sv; } #endif //for( int ieppV = 0; ieppV < neppV; ieppV++ ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h index 76e0e2bdf3..91ef862eda 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.h @@ -1,6 +1,13 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and 
contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// MadGraph5_aMC@NLO v. 3.5.3_lo_vect, 2023-12-23 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -18,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -56,17 +63,32 @@ namespace mg5amcCpu //bool verbose() const { return m_verbose; } bool debug() const { return m_debug; } - public: /* clang-format on */ + public: + + // Process-independent compile-time constants + static constexpr int np4 = 4; // dimensions of 4-momenta (E,px,py,pz) + static constexpr int nw6 = 6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) + + // Process-dependent compile-time constants + static constexpr int npari = 2; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + static constexpr int nparf = 2; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + static constexpr int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- + static constexpr int ncomb = 16; // #helicity combinations: e.g. 
16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) // Hardcoded parameters for this process (constant class variables) - //static const int ninitial = mgOnGpu::npari; - //static const int nexternal = 4; // mgOnGpu::npar (nexternal was nioparticles) - //static const int nprocesses = 1; // FIXME: assume process.nprocesses == 1 - //static const int nwavefuncs = 6; // mgOnGpu::nwf + // [NB: this class assumes nprocesses==1 i.e. a single DSIG1 and no DSIG2 in Fortran (#272 and #343)] + // [NB: these parameters (e.g. nwf) are P1-specific, i.e. they are different for different P1 subdirectories (#644)] + // [NB: I am currently unable to get the right value of nwf in CPPProcess.h - will hardcode it in CPPProcess.cc instead (#644)] + //static const int nwf = ??; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) + + // Other variables of this instance (???) + //static const int ninitial = CPPProcess::npari; + //static const int nexternal = 4; // CPPProcess::npar (nexternal was nioparticles) + //static const int nwavefuncs = 6; // (?!?! 
this should be nwf but export_cpp gives the wrong value here) //static const int namplitudes = 3; - //static const int ncomb = 16; // mgOnGpu::ncomb + //static const int ncomb = 16; // CPPProcess::ncomb - private: + private: /* clang-format on */ // Command line arguments (constructor) bool m_verbose; @@ -85,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -98,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -128,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc deleted file mode 120000 index 09a0e03a16..0000000000 --- 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/RandomNumberKernels.cc +++ /dev/null @@ -1 +0,0 @@ -../RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc index f91ee8ebfb..e086ae12d9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/check_sa.cc @@ -1,8 +1,18 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2024) for the MG5aMC CUDACPP plugin. 
+//========================================================================== + #include "mgOnGpuConfig.h" #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -19,7 +29,9 @@ #include #include +#include // for feenableexcept #include +#include // for signal and SIGFPE #include #include #include @@ -46,7 +58,7 @@ int usage( char* argv0, int ret = 1 ) { std::cout << "Usage: " << argv0 - << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--common] [--rmbhst|--rmbdev] [--bridge]" + << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j] [--curhst|--curdev|--hirhst|--hirdev|--common] [--rmbhst|--rmbdev] [--bridge]" << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl; std::cout << std::endl; std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl; @@ -54,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -64,16 +76,45 @@ usage( char* argv0, int ret = 1 ) return ret; } +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + inline void FPEhandler( int sig ) + { +#ifdef MGONGPUCPP_GPUIMPL + std::cerr << "Floating Point Exception (GPU)" << std::endl; +#else + std::cerr << "Floating Point Exception (CPU)" << std::endl; +#endif + exit( 0 ); + } +} + int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace 
everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif + // Enable FPEs (test #701 and #733 - except on MacOS where feenableexcept is not defined #730) +#ifndef __APPLE__ + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + std::cout << "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" << std::endl; + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif + // DEFAULTS FOR COMMAND LINE ARGUMENTS bool verbose = false; bool debug = false; @@ -90,15 +131,31 @@ main( int argc, char** argv ) enum class RandomNumberMode { CommonRandom = 0, - CurandHost = 1, - CurandDevice = 2 + CurandHost = -1, + CurandDevice = 1, + HiprandHost = -2, + HiprandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU -#elif not defined MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#if defined __CUDACC__ +#ifndef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on NVidia GPU if build has no curand (PR #784 and #785) +#endif +#elif defined __HIPCC__ +#ifndef MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = RandomNumberMode::HiprandDevice; // default on AMD GPU if build has hiprand +#else + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on AMD GPU if build has no hiprand +#endif +#else +#ifndef MGONGPU_HAS_NO_CURAND + RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand +#elif not defined MGONGPU_HAS_NO_HIPRAND + RandomNumberMode rndgen = 
RandomNumberMode::HiprandDevice; // default on CPU if build has hiprand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has neither curand nor hiprand +#endif #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -106,10 +163,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -136,18 +193,40 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ - rndgen = RandomNumberMode::CurandDevice; +#ifndef __CUDACC__ + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); +#elif defined MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + rndgen = RandomNumberMode::CurandDevice; #endif } else if( arg == "--curhst" ) { -#ifndef MGONGPU_HAS_NO_CURAND +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); +#else rndgen = RandomNumberMode::CurandHost; +#endif + } + else if( arg == "--hirdev" ) + { +#ifndef __HIPCC__ + throw std::runtime_error( "HiprandDevice is not supported on CPUs or non-AMD GPUs" ); +#elif defined MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandDevice is not supported because this application was built without Hiprand support" ); #else - throw 
std::runtime_error( "CurandHost is not supported because this application was built without Curand support" ); + rndgen = RandomNumberMode::HiprandDevice; +#endif + } + else if( arg == "--hirhst" ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "HiprandHost is not supported because this application was built without Hiprand support" ); +#else + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //rndgen = RandomNumberMode::HiprandHost; #endif } else if( arg == "--common" ) @@ -156,7 +235,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -220,6 +299,20 @@ main( int argc, char** argv ) #endif } + if( rmbsmp == RamboSamplingMode::RamboHost && rndgen == RandomNumberMode::HiprandDevice ) + { +#if not defined MGONGPU_HAS_NO_HIPRAND + // See https://github.com/ROCm/hipRAND/issues/76 + //std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use HiprandHost" << std::endl; + //rndgen = RandomNumberMode::HiprandHost; + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet)" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#else + std::cout << "WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom" << std::endl; + rndgen = RandomNumberMode::CommonRandom; +#endif + } + constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout constexpr int neppR = MemoryAccessRandomNumbers::neppR; // AOSOA layout @@ -230,13 +323,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 
1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -254,36 +347,36 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process const std::string procKey = "0a ProcInit"; timermap.start( procKey ); - // Create a process object + // Create a process object, read param card and set parameters + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? (for instance, in bridge mode this will be called twice here?) 
CPPProcess process( verbose ); - - // Read param_card and set parameters process.initProc( "../../Cards/param_card.dat" ); const fptype energy = 1500; // historical default, Ecms = 1500 GeV = 1.5 TeV (above the Z peak) //const fptype energy = 91.2; // Ecms = 91.2 GeV (Z peak) //const fptype energy = 0.100; // Ecms = 100 MeV (well below the Z peak, pure em scattering) - const int meGeVexponent = -( 2 * mgOnGpu::npar - 8 ); + const int meGeVexponent = -( 2 * CPPProcess::npar - 8 ); // --- 0b. Allocate memory structures const std::string alloKey = "0b MemAlloc"; timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -291,7 +384,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -299,7 +392,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -307,7 +400,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -324,7 +417,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -333,7 +426,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL 
HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -342,7 +435,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -350,7 +443,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -358,7 +451,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -370,7 +463,7 @@ main( int argc, char** argv ) std::unique_ptr wavetimes( new double[niter] ); std::unique_ptr wv3atimes( new double[niter] ); - // --- 0c. Create curand or common generator + // --- 0c. Create curand, hiprand or common generator const std::string cgenKey = "0c GenCreat"; timermap.start( cgenKey ); // Allocate the appropriate RandomNumberKernel @@ -379,30 +472,48 @@ main( int argc, char** argv ) { prnk.reset( new CommonRandomNumberKernel( hstRndmom ) ); } -#ifndef MGONGPU_HAS_NO_CURAND else if( rndgen == RandomNumberMode::CurandHost ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! 
CurandHost is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#else const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } -#ifdef __CUDACC__ - else + else if( rndgen == RandomNumberMode::CurandDevice ) { +#ifdef MGONGPU_HAS_NO_CURAND + throw std::runtime_error( "INTERNAL ERROR! CurandDevice is not supported because this application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __CUDACC__ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); - } #else - else - { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) - } + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif -#else - else + } + else if( rndgen == RandomNumberMode::HiprandHost ) { - throw std::logic_error( "This application was built without Curand support" ); // INTERNAL ERROR (no path to this statement) +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandHost is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#else + const bool onDevice = false; + prnk.reset( new HiprandRandomNumberKernel( hstRndmom, onDevice ) ); +#endif } + else if( rndgen == RandomNumberMode::HiprandDevice ) + { +#ifdef MGONGPU_HAS_NO_HIPRAND + throw std::runtime_error( "INTERNAL ERROR! HiprandDevice is not supported because this application was built without Hiprand support" ); // INTERNAL ERROR (no path to this statement) +#elif defined __HIPCC__ + const bool onDevice = true; + prnk.reset( new HiprandRandomNumberKernel( devRndmom, onDevice ) ); +#else + throw std::logic_error( "INTERNAL ERROR! 
HiprandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif + } + else + throw std::logic_error( "INTERNAL ERROR! Unknown rndgen value?" ); // INTERNAL ERROR (no path to this statement) // --- 0c. Create rambo sampling kernel [keep this in 0c for the moment] std::unique_ptr prsk; @@ -412,7 +523,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -423,7 +534,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -431,7 +542,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -456,7 +567,7 @@ main( int argc, char** argv ) // *** START THE OLD-STYLE TIMER FOR RANDOM GEN *** double genrtime = 0; - // --- 1a. Seed rnd generator (to get same results on host and device in curand) + // --- 1a. Seed rnd generator (to get same results on host and device in curand/hiprand) // [NB This should not be necessary using the host API: "Generation functions // can be called multiple times on the same generator to generate successive // blocks of results. 
For pseudorandom generators, multiple calls to generation @@ -473,8 +584,10 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ - if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) +#ifdef MGONGPUCPP_GPUIMPL + if( rndgen != RandomNumberMode::CurandDevice && + rndgen != RandomNumberMode::HiprandDevice && + rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device const std::string htodKey = "1c CpHTDrnd"; @@ -505,7 +618,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -550,7 +663,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -579,7 +692,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. 
CopyDToH MEs @@ -618,7 +731,7 @@ main( int argc, char** argv ) { // Display momenta std::cout << "Momenta:" << std::endl; - for( int ipar = 0; ipar < mgOnGpu::npar; ipar++ ) + for( int ipar = 0; ipar < CPPProcess::npar; ipar++ ) { // NB: 'setw' affects only the next field (of any type) std::cout << std::scientific // fixed format: affects all floats (default precision: 6) @@ -633,7 +746,7 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '-' ) << std::endl; // Display matrix elements std::cout << " Matrix element = " << MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ) - << " GeV^" << meGeVexponent << std::endl; // FIXME: assume process.nprocesses == 1 + << " GeV^" << meGeVexponent << std::endl; std::cout << std::string( SEP79, '-' ) << std::endl; } } @@ -720,20 +833,28 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; + else if( rndgen == RandomNumberMode::HiprandHost ) + rndgentxt = "ROCRAND HOST"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + rndgentxt = "ROCRAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -744,7 +865,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -755,6 +876,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -762,7 +889,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #endif // -- COMMON or CURAND HOST or CURAND DEVICE random numbers? if( rndgen == RandomNumberMode::CommonRandom ) @@ -771,6 +898,10 @@ main( int argc, char** argv ) wrkflwtxt += "CURHST+"; else if( rndgen == RandomNumberMode::CurandDevice ) wrkflwtxt += "CURDEV+"; + else if( rndgen == RandomNumberMode::HiprandHost ) + wrkflwtxt += "HIRHST+"; + else if( rndgen == RandomNumberMode::HiprandDevice ) + wrkflwtxt += "HIRDEV+"; else wrkflwtxt += "??????+"; // no path to this statement // -- HOST or DEVICE rambo sampling? @@ -780,7 +911,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -812,7 +943,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? 
#if defined MGONGPU_CPPSIMD @@ -836,7 +967,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -857,6 +988,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -883,21 +1016,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -928,14 +1063,14 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif #endif //<< "MatrixElements compiler = " << process.getCompiler() << std::endl << std::string( SEP79, '-' ) << std::endl - << "HelicityComb Good/Tot = " << nGoodHel << "/" << mgOnGpu::ncomb << std::endl + << "HelicityComb Good/Tot = " << nGoodHel << "/" << CPPProcess::ncomb << std::endl << std::string( SEP79, '-' ) << std::endl << "NumberOfEntries = " << niter << std::endl << std::scientific // fixed format: affects all floats (default precision: 6) @@ -955,7 +1090,7 @@ main( int argc, char** argv ) //<< "StdDevTimeInMECalcOnly = ( " << stdw3atim << std::string(16, ' ') << " ) sec" << std::endl << std::string( SEP79, '-' ) << std::endl //<< "ProcessID: = " << getpid() << std::endl - //<< "NProcesses = " << process.nprocesses << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) << "TotalEventsComputed = " << nevtALL << std::endl << "EvtsPerSec[Rnd+Rmb+ME](123) = ( " << nevtALL / ( sumgtim + sumrtim + sumwtim ) << std::string( 16, ' ' ) << " ) sec^-1" << std::endl @@ -1024,14 +1159,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined 
MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1039,10 +1176,10 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif - << "\"Curand generation\": " + << "\"Random generation\": " << "\"" << rndgentxt << "\"," << std::endl; double minelem = hstStats.minME; @@ -1070,7 +1207,7 @@ main( int argc, char** argv ) << "\"MaxTimeInMatrixElems\": \"" << std::to_string( maxwtim ) << " sec\"," << std::endl //<< "ProcessID: = " << getpid() << std::endl - //<< "NProcesses = " << process.nprocesses << std::endl + //<< "NProcesses = " << process.nprocesses << std::endl // assume nprocesses == 1 (#272 and #343) << "\"TotalEventsComputed\": " << nevtALL << "," << std::endl << "\"EvtsPerSec[Rnd+Rmb+ME](123)\": \"" << std::to_string( nevtALL / ( sumgtim + sumrtim + sumwtim ) ) << " sec^-1\"," << std::endl diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h index fd78e0cce4..f52a584d14 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/epoch_process_id.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Oct 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef EPOCH_PROCESS_ID_H #define EPOCH_PROCESS_ID_H 1 @@ -6,6 +11,6 @@ #define MG_EPOCH_PROCESS_ID SIGMA_MSSM_SLHA2_GG_TTX // For simplicity, define here the name of the process-dependent reference file for tests -#define MG_EPOCH_REFERENCE_FILE_NAME "../../../../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt" +#define MG_EPOCH_REFERENCE_FILE_NAME "../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt" #endif // EPOCH_PROCESS_ID_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f index c0bbf580ef..772339d0ac 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/fcheck_sa.f @@ -1,3 +1,8 @@ +C Copyright (C) 2020-2023 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + PROGRAM FCHECK_SA IMPLICIT NONE INCLUDE 'fsampler.inc' @@ -14,8 +19,6 @@ PROGRAM FCHECK_SA DOUBLE PRECISION GS(NEVTMAX) DOUBLE PRECISION RNDHEL(NEVTMAX) ! not yet used DOUBLE PRECISION RNDCOL(NEVTMAX) ! not yet used - INTEGER*4 CHANID - PARAMETER(CHANID=0) ! TEMPORARY? disable multi-channel in fcheck.exe and fgcheck.exe #466 DOUBLE PRECISION MES(NEVTMAX) INTEGER*4 SELHEL(NEVTMAX) ! not yet used INTEGER*4 SELCOL(NEVTMAX) ! not yet used @@ -59,8 +62,8 @@ PROGRAM FCHECK_SA DO IEVT = 1, NEVT GS(IEVT) = 1.2177157847767195 ! fixed G for aS=0.118 (hardcoded for now in check_sa.cc, fcheck_sa.f, runTest.cc) END DO - CALL FBRIDGESEQUENCE(BRIDGE, MOMENTA, GS, - & RNDHEL, RNDCOL, CHANID, MES, SELHEL, SELCOL) + CALL FBRIDGESEQUENCE_NOMULTICHANNEL(BRIDGE, MOMENTA, GS, ! TEMPORARY? 
disable multi-channel in fcheck.exe and fgcheck.exe #466 + & RNDHEL, RNDCOL, MES, SELHEL, SELCOL) DO IEVT = 1, NEVT c DO IEXTERNAL = 1, NEXTERNAL c WRITE(6,*) 'MOMENTA', IEVT, IEXTERNAL, diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu deleted file mode 120000 index 12c1d49d13..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gBridgeKernels.cu +++ /dev/null @@ -1 +0,0 @@ -BridgeKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu deleted file mode 120000 index 1fc8661d4e..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCPPProcess.cu +++ /dev/null @@ -1 +0,0 @@ -CPPProcess.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu deleted file mode 120000 index 9a05a7b55a..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gCrossSectionKernels.cu +++ /dev/null @@ -1 +0,0 @@ -CrossSectionKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu deleted file mode 120000 index 82415576cc..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gMatrixElementKernels.cu +++ /dev/null @@ -1 +0,0 @@ -MatrixElementKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu deleted file mode 120000 index 8dbfaa6493..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRamboSamplingKernels.cu +++ /dev/null @@ -1 +0,0 @@ -RamboSamplingKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu deleted file mode 120000 index 26580cf106..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gRandomNumberKernels.cu +++ /dev/null @@ -1 +0,0 @@ -RandomNumberKernels.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu deleted file mode 120000 index b99171c25e..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/gcheck_sa.cu +++ /dev/null @@ -1 +0,0 @@ -check_sa.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index ed2e042427..79abbcc4f8 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,6 +1,11 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -9,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -130,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -142,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -166,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h index f40433af4a..7c214cd74b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -5,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -88,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc deleted file mode 100644 index eb8bc09ea9..0000000000 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.cc +++ /dev/null @@ -1,149 +0,0 @@ -#include "RandomNumberKernels.h" - -#include "CommonRandomNumbers.h" -#include "CudaRuntime.h" -#include "MemoryBuffers.h" - -#include - -#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ -#define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } -inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) -{ - if ( code != CURAND_STATUS_SUCCESS ) - { - printf( "CurandAssert: %s:%d code=%d\n", file, line, code ); - if ( abort ) assert( code == CURAND_STATUS_SUCCESS ); - } -} -#endif /* clang-format on */ 
- -#ifdef __CUDACC__ -namespace mg5amcGpu -#else -namespace mg5amcCpu -#endif -{ - //-------------------------------------------------------------------------- - - CommonRandomNumberKernel::CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ) - : RandomNumberKernelBase( rnarray ) - , m_seed( 20211220 ) - { - if( m_rnarray.isOnDevice() ) - throw std::runtime_error( "CommonRandomNumberKernel on host with a device random number array" ); - } - - //-------------------------------------------------------------------------- - - void CommonRandomNumberKernel::generateRnarray() - { - std::vector rnd = CommonRandomNumbers::generate( m_rnarray.size(), m_seed ); // NB: generate as double (HARDCODED) - std::copy( rnd.begin(), rnd.end(), m_rnarray.data() ); // NB: copy may imply a double-to-float conversion - } - - //-------------------------------------------------------------------------- - -#ifndef MGONGPU_HAS_NO_CURAND - CurandRandomNumberKernel::CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) - : RandomNumberKernelBase( rnarray ) - , m_isOnDevice( onDevice ) - { - if( m_isOnDevice ) - { -#ifdef __CUDACC__ - if( !m_rnarray.isOnDevice() ) - throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); -#else - throw std::runtime_error( "CurandRandomNumberKernel does not support CurandDevice on CPU host" ); -#endif - } - else - { - if( m_rnarray.isOnDevice() ) - throw std::runtime_error( "CurandRandomNumberKernel on host with a device random number array" ); - } - createGenerator(); - } - - //-------------------------------------------------------------------------- - - CurandRandomNumberKernel::~CurandRandomNumberKernel() - { - destroyGenerator(); - } - - //-------------------------------------------------------------------------- - - void CurandRandomNumberKernel::seedGenerator( const unsigned int seed ) - { - if( m_isOnDevice ) - { - destroyGenerator(); // workaround for #429 - createGenerator(); // 
workaround for #429 - } - //printf( "seedGenerator: seed %d\n", seed ); - checkCurand( curandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); - } - - //-------------------------------------------------------------------------- - - void CurandRandomNumberKernel::createGenerator() - { - // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] - const curandRngType_t type = CURAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) - //const curandRngType_t type = CURAND_RNG_PSEUDO_XORWOW; // 0.049s | 0.0016s - //const curandRngType_t type = CURAND_RNG_PSEUDO_MRG32K3A; // 0.71s | 0.0012s (better but slower, especially in c++) - //const curandRngType_t type = CURAND_RNG_PSEUDO_MT19937; // 21s | 0.021s - //const curandRngType_t type = CURAND_RNG_PSEUDO_PHILOX4_32_10; // 0.024s | 0.00026s (used to segfault?) - if( m_isOnDevice ) - { - checkCurand( curandCreateGenerator( &m_rnGen, type ) ); - } - else - { - checkCurand( curandCreateGeneratorHost( &m_rnGen, type ) ); - } - //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_LEGACY ) ); // fails with code=104 (see #429) - checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_BEST ) ); - //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_DYNAMIC ) ); // fails with code=104 (see #429) - //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_SEEDED ) ); // fails with code=104 (see #429) - } - - //-------------------------------------------------------------------------- - - void CurandRandomNumberKernel::destroyGenerator() - { - checkCurand( curandDestroyGenerator( m_rnGen ) ); - } - - //-------------------------------------------------------------------------- - - void CurandRandomNumberKernel::generateRnarray() - { -#if defined MGONGPU_FPTYPE_DOUBLE - checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); -#elif defined 
MGONGPU_FPTYPE_FLOAT - checkCurand( curandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); -#endif - /* - printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); - fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ - if( m_rnarray.isOnDevice() ) - { - data = new fptype[m_rnarray.size()](); - checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); - } -#endif - for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) - printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ - if( m_rnarray.isOnDevice() ) delete[] data; -#endif - */ - } - - //-------------------------------------------------------------------------- -#endif -} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h index 4d55f3d449..7ed728a26c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,16 +1,23 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2024) for the MG5aMC CUDACPP plugin. 
+ #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined -#ifndef MGONGPU_HAS_NO_CURAND -#include "curand.h" -#endif - #include "MemoryBuffers.h" -#ifdef __CUDACC__ +// Forward definition from curand.h (the full header is only needed in CurandRandomKernel.cc) +struct curandGenerator_st; + +// Forward definition from hiprand.h (the full header is only needed in HiprandRandomKernel.cc) +struct rocrand_generator_base_type; +typedef rocrand_generator_base_type hiprandGenerator_st; + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -101,7 +108,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPU_HAS_NO_CURAND // A class encapsulating CURAND random number generation on a CPU host or on a GPU device class CurandRandomNumberKernel final : public RandomNumberKernelBase { @@ -136,10 +142,49 @@ namespace mg5amcCpu const bool m_isOnDevice; // The curand generator - curandGenerator_t m_rnGen; + // (NB: curand.h defines typedef curandGenerator_t as a pointer to forward-defined 'struct curandGenerator_st') + curandGenerator_st* m_rnGen; }; -#endif + //-------------------------------------------------------------------------- + + // A class encapsulating HIPRAND random number generation on a CPU host or on a GPU device + class HiprandRandomNumberKernel final : public RandomNumberKernelBase + { + public: + + // Constructor from an existing output buffer + HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ); + + // Destructor + ~HiprandRandomNumberKernel(); + + // Seed the random number generator + void seedGenerator( const unsigned int seed ) override final; + + // Generate the random number array + void generateRnarray() override final; + + // Is this a host or device kernel? 
+ bool isOnDevice() const override final { return m_isOnDevice; } + + private: + + // Create the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void createGenerator(); + + // Destroy the generator (workaround for #429: do this in every seedGenerator call rather than only in the ctor) + void destroyGenerator(); + + private: + + // Is this a host or device kernel? + const bool m_isOnDevice; + + // The hiprand generator + // (NB: hiprand.h defines typedef hiprandGenerator_t as a pointer to forward-defined 'struct hiprandGenerator_st') + hiprandGenerator_st* m_rnGen; + }; //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 2155495366..3ad91dfd59 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,8 +1,16 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g. 
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +#=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts +#=== NB: use 'override' to ensure that the value can not be modified from the outside +override CUDACPP_MAKEFILE := $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) +###$(info CUDACPP_MAKEFILE='$(CUDACPP_MAKEFILE)') -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk +#=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories +override CUDACPP_SRC_MAKEFILE = cudacpp_src.mk #------------------------------------------------------------------------------- @@ -24,32 +32,78 @@ UNAME_P := $(shell uname -p) #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Include the common MG5aMC Makefile options + +# OM: this is crucial for MG5aMC flag consistency/documentation +# AV: temporarely comment this out because it breaks cudacpp builds +ifneq ($(wildcard ../../Source/make_opts),) +include ../../Source/make_opts +endif + +#------------------------------------------------------------------------------- + +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) INCFLAGS += -I../../src -# Dependency on tools directory -TOOLSDIR = ../../../../../tools -INCFLAGS += -I$(TOOLSDIR) +# Compiler-specific googletest build directory (#125 and #738) +ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) +override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) +else ifneq ($(shell $(CXX) --version | egrep '^clang'),) +override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) +override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) +else +override CXXNAME = unknown +endif +###$(info CXXNAME=$(CXXNAME)) +override CXXNAMESUFFIX = _$(CXXNAME) +export CXXNAMESUFFIX # Dependency on test directory -TESTDIR = ../../../../../test -GTESTLIBDIR = $(TESTDIR)/googletest/build/lib/ -GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a +# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) +# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) +###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation +###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation +TESTDIRCOMMON = ../../../../../test +TESTDIRLOCAL = ../../test +ifneq ($(wildcard $(GTEST_ROOT)),) +TESTDIR = +else ifneq ($(LOCALGTEST),) +TESTDIR=$(TESTDIRLOCAL) +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else ifneq ($(wildcard 
../../../../../epochX/cudacpp/CODEGEN),) +TESTDIR = $(TESTDIRCOMMON) +GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) +else +TESTDIR = +endif +ifneq ($(GTEST_ROOT),) +GTESTLIBDIR = $(GTEST_ROOT)/lib64/ +GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a +GTESTINC = -I$(GTEST_ROOT)/include +else +GTESTLIBDIR = +GTESTLIBS = +GTESTINC = +endif +###$(info GTEST_ROOT = $(GTEST_ROOT)) +###$(info LOCALGTEST = $(LOCALGTEST)) +###$(info TESTDIR = $(TESTDIR)) #------------------------------------------------------------------------------- #=== Configure the C++ compiler -CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) $(USE_NVTX) -Wall -Wshadow -Wextra +CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) -Wall -Wshadow -Wextra ifeq ($(shell $(CXX) --version | grep ^nvc++),) -CXXFLAGS+= -ffast-math # see issue #117 +CXXFLAGS += -ffast-math # see issue #117 endif ###CXXFLAGS+= -Ofast # performance is not different from --fast-math ###CXXFLAGS+= -g # FOR DEBUGGING ONLY @@ -60,26 +114,52 @@ endif # Note: AR, CXX and FC are implicitly defined if not set externally # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) 
be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
+ +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -89,44 +169,84 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CULIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + CUINC = -I$(CUDA_HOME)/include/ + CUOPTFLAGS = -lineinfo + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + 
CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= - override CULIBFLAGS= -endif + override CUINC= + override HIPINC= -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -135,15 +255,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -159,27 +279,31 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) +###else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) # AV for Mac (Apple clang compiler) +else ifeq ($(UNAME_S),Darwin) # OM for Mac (any compiler) +override OMPFLAGS = # AV disable OpenMP MT on Apple clang (builds fail in the CI #578) +###override OMPFLAGS = -fopenmp # OM reenable OpenMP MT on Apple clang? 
(AV Oct 2023: this still fails in the CI) else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) +override OMPFLAGS = -fopenmp # enable OpenMP MT by default on all other platforms +###override OMPFLAGS = # disable OpenMP MT on all other platforms (default before #575) endif # Set the default AVX (vectorization) choice @@ -223,24 +347,62 @@ ifeq ($(HRDCOD),) override HRDCOD = 0 endif -# Set the default RNDGEN (random number generator) choice -ifeq ($(NVCC),) - override RNDGEN = hasNoCurand -else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too +# Export AVX, FPTYPE, HELINL, HRDCOD, OMPFLAGS so that it is not necessary to pass them to the src Makefile too export AVX export FPTYPE export HELINL export HRDCOD -export RNDGEN export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Configure defaults and check if user-defined choices exist for RNDGEN (legacy!), HASCURAND, HASHIPRAND + +# If the legacy RNDGEN exists, this take precedence over any HASCURAND choice (but a warning is printed out) +###$(info RNDGEN=$(RNDGEN)) +ifneq ($(RNDGEN),) + $(warning Environment variable RNDGEN is no longer supported, please use HASCURAND instead!) + ifeq ($(RNDGEN),hasCurand) + override HASCURAND = $(RNDGEN) + else ifeq ($(RNDGEN),hasNoCurand) + override HASCURAND = $(RNDGEN) + else ifneq ($(RNDGEN),hasNoCurand) + $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported - but use HASCURAND instead!) 
+ endif +endif + +# Set the default HASCURAND (curand random number generator) choice, if no prior choice exists for HASCURAND +# (NB: allow HASCURAND=hasCurand even if $(GPUCC) does not point to nvcc: assume CUDA_HOME was defined correctly...) +ifeq ($(HASCURAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASCURAND = hasNoCurand + else ifeq ($(findstring nvcc,$(GPUCC)),nvcc) # Nvidia GPU build + override HASCURAND = hasCurand + else # non-Nvidia GPU build + override HASCURAND = hasNoCurand + endif +endif + +# Set the default HASHIPRAND (hiprand random number generator) choice, if no prior choice exists for HASHIPRAND +# (NB: allow HASHIPRAND=hasHiprand even if $(GPUCC) does not point to hipcc: assume HIP_HOME was defined correctly...) +ifeq ($(HASHIPRAND),) + ifeq ($(GPUCC),) # CPU-only build + override HASHIPRAND = hasNoHiprand + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) # AMD GPU build + override HASHIPRAND = hasHiprand + else # non-AMD GPU build + override HASHIPRAND = hasNoHiprand + endif +endif + +# Export HASCURAND, HASHIPRAND so that it is not necessary to pass them to the src Makefile too +# (NB: these variables in cudacpp_src.mk are only used to define the build tag, they are NOT needed for RNDCXXFLAGS or RNDLIBFLAGS) +export HASCURAND +export HASHIPRAND + +#------------------------------------------------------------------------------- + +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -298,13 +460,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += 
-DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -313,7 +475,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -322,19 +484,40 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) + +#=== Set the CUDA/HIP/C++ compiler and linker flags appropriate to user-defined choices of HASCURAND, HASHIPRAND + +$(info HASCURAND=$(HASCURAND)) +$(info HASHIPRAND=$(HASHIPRAND)) +override RNDCXXFLAGS= +override RNDLIBFLAGS= + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASCURAND choice (example: "make HASCURAND=hasNoCurand") +ifeq ($(HASCURAND),hasNoCurand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_CURAND +else ifeq ($(HASCURAND),hasCurand) + override RNDLIBFLAGS += -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
+else + $(error Unknown HASCURAND='$(HASCURAND)': only 'hasCurand' and 'hasNoCurand' are supported) +endif + +# Set the RNDCXXFLAGS and RNDLIBFLAGS build flags appropriate to each HASHIPRAND choice (example: "make HASHIPRAND=hasNoHiprand") +ifeq ($(HASHIPRAND),hasNoHiprand) + override RNDCXXFLAGS += -DMGONGPU_HAS_NO_HIPRAND +else ifeq ($(HASHIPRAND),hasHiprand) + override RNDLIBFLAGS += -L$(HIP_HOME)/lib/ -lhiprand +else ifneq ($(HASHIPRAND),hasHiprand) + $(error Unknown HASHIPRAND='$(HASHIPRAND)': only 'hasHiprand' and 'hasNoHiprand' are supported) endif +#$(info RNDCXXFLAGS=$(RNDCXXFLAGS)) +#$(info HASHIPRAND=$(HASHIPRAND)) + #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -345,7 +528,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ifeq ($(USEBUILDDIR),1) @@ -372,11 +555,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or 
DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -389,7 +572,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -399,11 +582,15 @@ endif testmain=$(BUILDDIR)/runTest.exe -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(testmain) $(fcu_main) $(fcxx_main) +ifneq ($(GTESTLIBS),) +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) $(testmain) +else +all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) +endif # Target (and build options): debug MAKEDEBUG= -debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: OPTFLAGS = -g -O0 debug: CUOPTFLAGS = -G debug: MAKEDEBUG := debug debug: all.$(TAG) @@ -415,35 +602,58 @@ $(BUILDDIR)/.build.$(TAG): @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi @touch $(BUILDDIR)/.build.$(TAG) -# Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h +# Generic target and build rules: objects from CUDA or HIP compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h +# (NB do not include CUINC here! add it only for NVTX or curand #679) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(CUINC) -fPIC -c $< -o $@ + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ -# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117) +# Apply special build flags only to CrossSectionKernel[_cu].o (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +endif endif + +# Apply special build flags only to check_sa[_cu].o (NVTX in timermap.h, #679) +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) +$(BUILDDIR)/check_sa_cu.o: CXXFLAGS += $(USE_NVTX) $(CUINC) + +# Apply special 
build flags only to check_sa[_cu].o and (Cu|Hip)randRandomNumberKernel[_cu].o +$(BUILDDIR)/check_sa.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/check_sa_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/CurandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(RNDCXXFLAGS) +$(BUILDDIR)/HiprandRandomNumberKernel_cu.o: CUFLAGS += $(RNDCXXFLAGS) +ifeq ($(HASCURAND),hasCurand) # curand headers, #679 +$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) +endif +ifeq ($(HASHIPRAND),hasHiprand) # hiprand headers +$(BUILDDIR)/HiprandRandomNumberKernel.o: CXXFLAGS += $(HIPINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -451,15 +661,15 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif -#### Apply special build flags only to CPPProcess.cc (-flto) +#### Apply special build flags only to CPPProcess.o (-flto) ###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto -#### Apply special build flags only to CPPProcess.cc (AVXFLAGS) +#### Apply special build flags only to CPPProcess.o (AVXFLAGS) ###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) #------------------------------------------------------------------------------- @@ -467,7 +677,7 @@ endif # Target (and build rules): common (src) library commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.build.$(TAG) $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) #------------------------------------------------------------------------------- @@ -477,12 +687,12 @@ processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/RandomNumberKernels.o $(BUILDDIR)/RamboSamplingKernels.o +cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda -cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o -cu_objects_exe=$(BUILDDIR)/gRandomNumberKernels.o 
$(BUILDDIR)/gRamboSamplingKernels.o +cu_objects_lib=$(BUILDDIR)/CPPProcess_cu.o $(BUILDDIR)/MatrixElementKernels_cu.o $(BUILDDIR)/BridgeKernels_cu.o $(BUILDDIR)/CrossSectionKernels_cu.o +cu_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel_cu.o $(BUILDDIR)/RamboSamplingKernels_cu.o endif # Target (and build rules): C++ and CUDA shared libraries @@ -491,11 +701,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -509,19 +725,19 @@ endif # Target (and build rules): C++ and CUDA standalone executables $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) +$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) 
$(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o + $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(BUILDDIR)/HiprandRandomNumberKernel.o $(RNDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +$(cu_main): $(BUILDDIR)/check_sa_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o + $(GPUCC) -o $@ $(BUILDDIR)/check_sa_cu.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel_cu.o $(BUILDDIR)/HiprandRandomNumberKernel_cu.o $(RNDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -545,61 +761,69 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for 
LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(CULIBFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else + $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(CULIBFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) 
$(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- # Target (and build rules): test objects and test executable $(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/testxxx_cu.o # Comment out this line to skip the CUDA test of xxx functions endif $(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/testmisc_cu.o # Comment out this line to skip the CUDA miscellaneous tests endif $(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include 
+$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) -$(BUILDDIR)/runTest_cu.o: INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -608,7 +832,7 @@ $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o endif $(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += -I$(TESTDIR)/googletest/googletest/include +$(testmain): INCFLAGS += $(GTESTINC) $(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -lgtest_main ifneq ($(OMPFLAGS),) @@ -623,22 +847,37 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) $(CULIBFLAGS) + $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread 
$(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda $(CULIBFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif +endif + +# Use target gtestlibs to build only googletest +ifneq ($(GTESTLIBS),) +gtestlibs: $(GTESTLIBS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 $(GTESTLIBS): ifneq ($(shell which flock 2>/dev/null),) + @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) else - $(MAKE) -C $(TESTDIR) + if [ -d $(TESTDIR) ]; then $(MAKE) -C $(TESTDIR); fi endif #------------------------------------------------------------------------------- @@ -699,9 +938,12 @@ cleanall: $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) rm -rf build.* -# Target: clean the builds as well as the googletest installation +# Target: clean the builds as well as the gtest installation(s) distclean: cleanall - $(MAKE) -C $(TESTDIR) clean +ifneq ($(wildcard $(TESTDIRCOMMON)),) + $(MAKE) -C $(TESTDIRCOMMON) clean +endif + $(MAKE) -C $(TESTDIRLOCAL) clean #------------------------------------------------------------------------------- @@ -730,9 +972,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -751,7 +993,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc index 9c9287e0c5..27ce14277f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.cc @@ -1,6 +1,11 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -17,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -41,14 +46,10 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif - // Create a process object, read parm card and set parameters - // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? - // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
- CPPProcess process( /*verbose=*/false ); - process.initProc( "../../Cards/param_card.dat" ); + // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran *ppbridge = new Bridge( *pnevtF, *pnparF, *pnp4F ); } @@ -64,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -95,17 +96,42 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) - pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); + pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol ); #else // Use the host/CPU implementation in the C++ library // (there is no device implementation in this library) - pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); + pbridge->cpu_sequence( momenta, gs, rndhel, rndcol, ( pchannelId ? *pchannelId : 0 ), mes, selhel, selcol ); #endif } + /** + * Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++, without multi-channel mode. + * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). 
+ * + * @param ppbridge the pointer to the Bridge pointer (the Bridge pointer is handled in Fortran as an INTEGER*8 variable) + * @param momenta the pointer to the input 4-momenta + * @param gs the pointer to the input Gs (running QCD coupling constant alphas) + * @param rndhel the pointer to the input random numbers for helicity selection + * @param rndcol the pointer to the input random numbers for color selection + * @param mes the pointer to the output matrix elements + * @param selhel the pointer to the output selected helicities + * @param selcol the pointer to the output selected colors + */ + void fbridgesequence_nomultichannel_( CppObjectInFortran** ppbridge, + const FORTRANFPTYPE* momenta, + const FORTRANFPTYPE* gs, + const FORTRANFPTYPE* rndhel, + const FORTRANFPTYPE* rndcol, + FORTRANFPTYPE* mes, + int* selhel, + int* selcol ) + { + fbridgesequence_( ppbridge, momenta, gs, rndhel, rndcol, nullptr, mes, selhel, selcol ); + } + /** * Retrieve the number of good helicities for helicity filtering in the Bridge. * This is a C symbol that should be called from the Fortran code (in auto_dsig1.f). diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc index f140b660fc..0c319d8e7c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fbridge.inc @@ -1,3 +1,8 @@ +C Copyright (C) 2020-2023 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+ C C Create a Bridge and return its pointer C - PBRIDGE: the memory address of the C++ Bridge @@ -31,7 +36,7 @@ C - MOMENTA: the input 4-momenta Fortran array C - GS: the input Gs (running QCD coupling constant alphas) Fortran array C - RNDHEL: the input random number Fortran array for helicity selection C - RNDCOL: the input random number Fortran array for color selection -C - CHANID: the input Feynman diagram to enhance in multi-channel mode if 1 to n (disable multi-channel if 0) +C - CHANID: the input Feynman diagram to enhance in multi-channel mode if 1 to n C - MES: the output matrix element Fortran array C - SELHEL: the output selected helicity Fortran array C - SELCOL: the output selected color Fortran array @@ -51,6 +56,31 @@ C END SUBROUTINE FBRIDGESEQUENCE END INTERFACE +C +C Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++. +C - PBRIDGE: the memory address of the C++ Bridge +C - MOMENTA: the input 4-momenta Fortran array +C - GS: the input Gs (running QCD coupling constant alphas) Fortran array +C - RNDHEL: the input random number Fortran array for helicity selection +C - RNDCOL: the input random number Fortran array for color selection +C - MES: the output matrix element Fortran array +C - SELHEL: the output selected helicity Fortran array +C - SELCOL: the output selected color Fortran array +C + INTERFACE + SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL(PBRIDGE, MOMENTA, GS, + & RNDHEL, RNDCOL, MES, SELHEL, SELCOL) + INTEGER*8 PBRIDGE + DOUBLE PRECISION MOMENTA(*) + DOUBLE PRECISION GS(*) + DOUBLE PRECISION RNDHEL(*) + DOUBLE PRECISION RNDCOL(*) + DOUBLE PRECISION MES(*) + INTEGER*4 SELHEL(*) + INTEGER*4 SELCOL(*) + END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL + END INTERFACE + C C Retrieve the number of good helicities for helicity filtering in the Bridge. 
C - PBRIDGE: the memory address of the C++ Bridge diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc index bc90937f47..3743934f41 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.cc @@ -1,13 +1,19 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #include "mgOnGpuConfig.h" #include "Bridge.h" +#include "CPPProcess.h" #include "MemoryBuffers.h" #include "RamboSamplingKernels.h" #include "RandomNumberKernels.h" //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -34,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -59,8 +65,8 @@ namespace mg5amcCpu , m_prnk( new CommonRandomNumberKernel( m_hstRndmom ) ) , m_prsk( new RamboSamplingKernelHost( energy, m_hstRndmom, m_hstMomenta, m_hstWeights, nevtF ) ) { - if( nparF != mgOnGpu::npar ) throw std::runtime_error( "Sampler constructor: npar mismatch" ); - if( np4F != mgOnGpu::np4 ) throw std::runtime_error( "Sampler constructor: np4 mismatch" ); + if( nparF != CPPProcess::npar ) throw std::runtime_error( "Sampler constructor: npar mismatch" ); + if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Sampler constructor: np4 mismatch" ); std::cout << 
"WARNING! Instantiate host Sampler (nevt=" << m_nevt << ")" << std::endl; } @@ -99,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc index d4895df206..5cc7134778 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/fsampler.inc @@ -1,3 +1,8 @@ +C Copyright (C) 2020-2023 CERN and UCLouvain. +C Licensed under the GNU Lesser General Public License (version 3 or later). +C Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. +C Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + C C Create a Sampler and return its pointer C - PSAMPLER: the memory address of the C++ Sampler diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h index e206b8e075..84d63eee4f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/nvtx.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Aug 2020, based on earlier work by Peter Heywood) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUNVTX_H #define MGONGPUNVTX_H 1 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h index 9f8dbbb7f9..b637288f99 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/ompnumthreads.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. 
Valassi (Dec 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef OMPNUMTHREADS_H #define OMPNUMTHREADS_H 1 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py index 63f4c714a7..ef37a03fdd 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/perf.py @@ -1,5 +1,10 @@ #!/usr/bin/env python3 +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Apr 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2020-2023) for the MG5aMC CUDACPP plugin. + from optparse import OptionParser from datetime import datetime from mpl_toolkits.mplot3d import Axes3D # noqa: F401 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh old mode 100755 new mode 100644 index 1d60fa3542..12ad545c38 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/profile.sh @@ -1,5 +1,10 @@ #!/bin/bash +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + usage(){ echo "Usage (GUI analysis): $0 -l label [-cc] [-p #blocks #threads #iterations]" echo "Usage (CL analysis): $0 -nogui [-p #blocks #threads #iterations]" diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc index a1cec39ced..7f248d29a4 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/runTest.cc @@ -1,3 +1,11 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests +//---------------------------------------------------------------------------- + #include "mgOnGpuConfig.h" #include "CPPProcess.h" @@ -10,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -19,15 +27,15 @@ using namespace mg5amcCpu; struct CUDA_CPU_TestBase : public TestDriverBase { static constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - static constexpr int np4 = mgOnGpu::np4; - static constexpr int npar = mgOnGpu::npar; + static constexpr int np4 = CPPProcess::np4; + static constexpr int npar = CPPProcess::npar; static_assert( gputhreads % neppM == 0, "ERROR! #threads/block should be a multiple of neppM" ); static_assert( gputhreads <= mgOnGpu::ntpbMAX, "ERROR! #threads/block should be <= ntpbMAX" ); CUDA_CPU_TestBase( const std::string& refFileName ) : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -61,8 +69,10 @@ struct CPUTest : public CUDA_CPU_TestBase , hstMatrixElements( nevt ) , hstSelHel( nevt ) , hstSelCol( nevt ) - , hstIsGoodHel( mgOnGpu::ncomb ) + , hstIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } @@ -109,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -118,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -163,7 +173,7 @@ struct CUDATest : public CUDA_CPU_TestBase , hstMatrixElements( nevt ) , hstSelHel( nevt ) , hstSelCol( nevt ) - , hstIsGoodHel( mgOnGpu::ncomb ) + , hstIsGoodHel( CPPProcess::ncomb ) , devRndmom( nevt ) , devMomenta( nevt ) , devGs( nevt ) @@ -173,8 +183,10 @@ struct CUDATest : public CUDA_CPU_TestBase , devMatrixElements( nevt ) , devSelHel( nevt ) , devSelCol( nevt ) - , devIsGoodHel( mgOnGpu::ncomb ) + , devIsGoodHel( CPPProcess::ncomb ) { + // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? + // FIXME: the CPPProcess should really be a singleton? 
process.initProc( "../../Cards/param_card.dat" ); } @@ -226,7 +238,7 @@ struct CUDATest : public CUDA_CPU_TestBase return MemoryAccessMatrixElements::ieventAccessConst( hstMatrixElements.data(), ievt ); } }; -#endif +#endif /* clang-format off */ // Use two levels of macros to force stringification at the right level // (see https://gcc.gnu.org/onlinedocs/gcc-3.0.1/cpp_3.html#SEC17 and https://stackoverflow.com/a/3419392) @@ -244,8 +256,8 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); -#endif +#endif /* clang-format on */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index 5fa8ac70fe..ac0b049e60 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -1,4 +1,10 @@ -// Use ./runTest.exe --gtest_filter=*misc to run only this test +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests +//---------------------------------------------------------------------------- #include "mgOnGpuConfig.h" @@ -11,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -19,33 +25,48 @@ #define XTESTID( s ) TESTID( s ) +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ #ifdef MGONGPU_CPPSIMD /* clang-format off */ -bool maskand( const bool_v& mask ){ bool out = true; for ( int i=0; i TFTF (1010) + bool_sv mask1_sv = ( channelids1_sv % 2 == 0 ); // even channels 1234 -> FTFT (0101) + constexpr fptype_sv fpZERO_sv{}; // 0000 + //fptype_sv numerators0_sv = mask0_sv * absamp0_sv; // invalid operands to binary * ('__vector(4) long int' and '__vector(4) double') + fptype_sv numerators0_sv = fpternary( mask0_sv, absamp0_sv, fpZERO_sv ); // equivalent to "mask0_sv * absamp0_sv" + fptype_sv numerators1_sv = fpternary( mask1_sv, absamp1_sv, fpZERO_sv ); // equivalent to "mask1_sv * absamp1_sv" +#ifdef MGONGPU_CPPSIMD + //std::cout << "numerators0_sv: " << numerators0_sv << std::endl; + //std::cout << "numerators1_sv: " << numerators1_sv << std::endl; + for( int i = 0; i < neppV; i++ ) + { + // Values of numerators0_sv: 10.*1 11.*0 12.*1 13.*0 + if( channelids0_sv[i] % 2 == 0 ) // even channels + EXPECT_TRUE( numerators0_sv[i] == ( 10. + i ) ); + else // odd channels + EXPECT_TRUE( numerators0_sv[i] == 0. ); + // Values of numerators1_sv: 11.*0 12.*1 13.*0 14.*1 + if( channelids1_sv[i] % 2 == 0 ) // even channels + EXPECT_TRUE( numerators1_sv[i] == ( 11. + i ) ); + else // odd channels + EXPECT_TRUE( numerators1_sv[i] == 0. 
); + } +#else + // Values of numerators0_sv: 10.*1 + EXPECT_TRUE( numerators0_sv == 10. ); + // Values of numerators1_sv: 11.*0 + EXPECT_TRUE( numerators1_sv == 0. ); +#endif + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc index 1052022dd8..d4bb7022d9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx.cc @@ -1,3 +1,11 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +//---------------------------------------------------------------------------- +// Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests +//---------------------------------------------------------------------------- + #include "mgOnGpuConfig.h" #include "CPPProcess.h" @@ -11,11 +19,12 @@ #include #include +#include // debug #701 (see https://stackoverflow.com/a/17473528) #include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -23,43 +32,91 @@ #define XTESTID( s ) TESTID( s ) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + std::string FPEhandlerMessage = "unknown"; + int FPEhandlerIevt = -1; + inline void FPEhandler( int sig ) + { +#ifdef MGONGPUCPP_GPUIMPL + std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; +#else + std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; +#endif + exit( 1 ); + } +} + TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) 
{ +#ifdef MGONGPUCPP_GPUIMPL + using namespace mg5amcGpu; +#else + using namespace mg5amcCpu; +#endif +#ifndef __APPLE__ // test #701 (except on MacOS where feenableexcept is not defined #730) + const char* enableFPEc = getenv( "CUDACPP_RUNTIME_ENABLEFPE" ); + const bool enableFPE = ( enableFPEc != 0 ) && ( std::string( enableFPEc ) != "" ); + if( enableFPE ) + { + feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + signal( SIGFPE, FPEhandler ); + } +#endif constexpr bool dumpEvents = false; // dump the expected output of the test? constexpr bool testEvents = !dumpEvents; // run the test? constexpr fptype toleranceXXXs = std::is_same::value ? 1.E-15 : 1.E-5; // Constant parameters constexpr int neppM = MemoryAccessMomenta::neppM; // AOSOA layout - using mgOnGpu::neppV; - using mgOnGpu::np4; - using mgOnGpu::npar; - const int nevt = 16; // 12 independent tests plus 4 duplicates (need a multiple of 8 for floats or for '512z') + constexpr int np4 = CPPProcess::np4; + const int nevt = 32; // 12 independent tests plus 20 duplicates (need a multiple of 16 for floats '512z') assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #endif /* clang-format off */ + // NB NEW TESTS FOR DEBUGGING #701: KEEP TWO SEPARATE SETS (16-SIMD-VECTORS!) OF TESTS FOR M==0 AND M!=0! 
const fptype par0[np4 * nevt] = // AOS[nevt][np4] { - 500, 0, 0, 500, // #0 (m=0 pT=0 E=pz>0) - 500, 0, 0, -500, // #1 (m=0 pT=0 -E=pz<0) - 500, 300, 400, 0, // #2 (m=0 pT>0 pz=0) - 500, 180, 240, 400, // #3 (m=0 pT>0 pz>0) - 500, 180, 240, -400, // #4 (m=0 pT>0 pz<0) - 500, 0, 0, 0, // #5 (m=50>0 pT=0 pz=0) - 500, 0, 0, 300, // #6 (m=40>0 pT=0 pz>0) - 500, 0, 0, -300, // #7 (m=40>0 pT=0 pz<0) - 500, 180, 240, 0, // #8 (m=40>0 pT>0 pz=0) - 500, -240, -180, 0, // #9 (m=40>0 pT>0 pz=0) - 500, 180, 192, 144, // #10 (m=40>0 pT>0 pz>0) - 500, 180, 192, -144, // #11 (m=40>0 pT>0 pz<0) - 500, 0, 0, 500, // DUPLICATE #12 == #0 (m=0 pT=0 E=pz>0) - 500, 0, 0, -500, // DUPLICATE #13 == #1 (m=0 pT=0 -E=pz<0) - 500, 300, 400, 0, // DUPLICATE #14 == #2 (m=0 pT>0 pz=0) - 500, 180, 240, 400 // DUPLICATE #15 == #3 (m=0 pT>0 pz>0) + 500, 0, 0, 500, // #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #5 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #6 DUPLICATE == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #7 DUPLICATE == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #8 DUPLICATE == #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #9 DUPLICATE == #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #10 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, -500, // #11 DUPLICATE == #1 (m=0 pT=0 -E=pz<0) + 500, 300, 400, 0, // #12 DUPLICATE == #2 (m=0 pT>0 pz=0) + 500, 180, 240, 400, // #13 DUPLICATE == #3 (m=0 pT>0 pz>0) + 500, 180, 240, -400, // #14 DUPLICATE == #4 (m=0 pT>0 pz<0) + 500, 0, 0, 500, // #15 DUPLICATE == #0 (m=0 pT=0 E=pz>0) + 500, 0, 0, 0, // #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #17 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #18 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #19 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #20 (m=40>0 pT>0 pz=0) + 500, 180, 192, 144, // #21 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // 
#22 (m=40>0 pT>0 pz<0) + 500, 0, 0, 0, // #23 DUPLICATE == #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300, // #24 DUPLICATE == #17 (m=40>0 pT=0 pz>0) + 500, 0, 0, -300, // #25 DUPLICATE == #18 (m=40>0 pT=0 pz<0) + 500, 180, 240, 0, // #26 DUPLICATE == #19 (m=40>0 pT>0 pz=0) + 500, -240, -180, 0, // #27 DUPLICATE == #20 (m=40>0 pT>0 pz=0) + 500, 180, 192, 144, // #28 DUPLICATE == #21 (m=40>0 pT>0 pz>0) + 500, 180, 192, -144, // #29 DUPLICATE == #22 (m=40>0 pT>0 pz<0) + 500, 0, 0, 0, // #30 DUPLICATE == #16 (m=50>0 pT=0 pz=0) + 500, 0, 0, 300 // #31 DUPLICATE == #17 (m=40>0 pT=0 pz>0) }; /* clang-format on */ // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) // See https://en.cppreference.com/w/c/language/array_initialization#Notes @@ -73,7 +130,11 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype p1 = par0[ievt * np4 + 1]; const fptype p2 = par0[ievt * np4 + 2]; const fptype p3 = par0[ievt * np4 + 3]; - mass0[ievt] = sqrt( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3 ); + volatile fptype m2 = fpmax( p0 * p0 - p1 * p1 - p2 * p2 - p3 * p3, 0 ); // see #736 + if( m2 > 0 ) + mass0[ievt] = fpsqrt( (fptype)m2 ); + else + mass0[ievt] = 0; ispzgt0[ievt] = ( p3 > 0 ); ispzlt0[ievt] = ( p3 < 0 ); isptgt0[ievt] = ( p1 != 0 ) || ( p2 != 0 ); @@ -92,10 +153,18 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) std::string dumpFileName = "testxxx_cc_ref.txt.new"; // Compute the output wavefunctions // Dump new reference file if requested - using mgOnGpu::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - int itest = 0; // index on the expected output vector + constexpr int nw6 = CPPProcess::nw6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 
6 for e+ e- -> mu+ mu- (fermions and vectors) + int itest = 0; // index on the expected output vector std::ofstream dumpFile; - if( dumpEvents ) dumpFile.open( dumpFileName, std::ios::trunc ); + if( dumpEvents ) + { + dumpFile.open( dumpFileName, std::ios::trunc ); + dumpFile << " // Copyright (C) 2020-2023 CERN and UCLouvain." << std::endl + << " // Licensed under the GNU Lesser General Public License (version 3 or later)." << std::endl + << " // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin." << std::endl + << " // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin." << std::endl; + } + // Lambda function for dumping wavefunctions auto dumpwf6 = [&]( std::ostream& out, const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) { out << std::setprecision( 15 ) << std::scientific; @@ -125,6 +194,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) } out << std::defaultfloat; }; + // Lambda function for testing wavefunctions (1) auto testwf6 = [&]( const cxtype_sv wf[6], const char* xxx, int ievt, int nsp, fptype mass ) { if( dumpEvents ) dumpwf6( dumpFile, wf, xxx, ievt, nsp, mass ); @@ -166,6 +236,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) } itest++; }; + // Lambda function for testing wavefunctions (2) auto testwf6two = [&]( const cxtype_sv wf[6], const cxtype_sv expwf[6], const char* xxx, int ievt ) { if( testEvents ) @@ -209,6 +280,32 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) } } }; + // Lambda function for resetting hstMomenta to the values of par0 + // This is needed in each test because hstMomenta may have been modified to ensure a function like ipzxxx can be used (#701) + auto resetHstMomentaToPar0 = [&]() + { + for( int ievt = 0; ievt < nevt; ievt++ ) + for( int ip4 = 0; ip4 < np4; ip4++ ) + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), ievt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + }; + // Lambda function for preparing the test of one specific function 
+ const bool debug = false; + auto prepareTest = [&]( const char* xxx, int ievt ) + { + if( debug ) std::cout << "Prepare test " << xxx << " ievt=" << ievt << std::endl; + resetHstMomentaToPar0(); + FPEhandlerMessage = xxx; + FPEhandlerIevt = ievt; + if( std::string( xxx ) == "ipzxxx" || std::string( xxx ) == "opzxxx" || std::string( xxx ) == "imzxxx" || std::string( xxx ) == "omzxxx" || std::string( xxx ) == "ixzxxx" || std::string( xxx ) == "oxzxxx" ) + { + // Modify hstMomenta so that ALL events have the momenta of a single ievt + // This ensures that a function like ipzxxx (which assumes pZ>0) can be used without triggering FPEs (#701) + // This is done by filling the full SIMD vector with the value of ievt, which was already tested to respect the relevant assumptions + for( int jevt = 0; jevt < nevt; jevt++ ) + for( int ip4 = 0; ip4 < np4; ip4++ ) + MemoryAccessMomenta::ieventAccessIp4Ipar( hstMomenta.data(), jevt, ip4, ipar0 ) = par0[ievt * np4 + ip4]; // AOS to AOSOA + } + }; // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) // See https://en.cppreference.com/w/c/language/array_initialization#Notes cxtype_sv outwfI[6] = {}; // last result of ixxxxx (mass==0) @@ -220,18 +317,20 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) fptype* fp_outwf = reinterpret_cast( outwf ); // proof of concept for using fptype* in the interface fptype* fp_outwf3 = reinterpret_cast( outwf3 ); // proof of concept for using fptype* in the interface const int nhel = 1; + // *** START OF TESTING LOOP for( auto nsp: { -1, +1 } ) // antifermion/fermion (or initial/final for scalar and vector) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif - if( false ) + if( debug ) { std::cout << std::endl; + std::cout << "nsp=" << nsp << " ievt=" << ievt << ": "; for( int ip4 = 0; ip4 < np4; ip4++ ) std::cout << par0[ievt * np4 + ip4] << ", "; std::cout << 
std::endl; } @@ -239,6 +338,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) const fptype* ievt0Momenta = MemoryAccessMomenta::ieventAccessRecordConst( hstMomenta.data(), ipagV * neppV ); // Test ixxxxx - NO ASSUMPTIONS { + prepareTest( "ixxxxx", ievt ); const fptype fmass = mass0[ievt]; ixxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfI, ipar0 ); testwf6( outwfI, "ixxxxx", ievt, nsp, fmass ); @@ -248,6 +348,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test ipzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) { + prepareTest( "ipzxxx", ievt ); ipzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfI, "ipzxxx", ievt ); testwf6( outwf, "ipzxxx", ievt, nsp, 0 ); @@ -255,6 +356,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test imzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) { + prepareTest( "imzxxx", ievt ); imzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfI, "imzxxx", ievt ); testwf6( outwf, "imzxxx", ievt, nsp, 0 ); @@ -262,12 +364,14 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test ixzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) if( mass0[ievt] == 0 && isptgt0[ievt] ) { + prepareTest( "ixzxxx", ievt ); ixzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfI, "ixzxxx", ievt ); testwf6( outwf, "ixzxxx", ievt, nsp, 0 ); } // Test vxxxxx - NO ASSUMPTIONS { + prepareTest( "vxxxxx", ievt ); const fptype vmass = mass0[ievt]; vxxxxx( ievt0Momenta, vmass, nhel, nsp, fp_outwf, ipar0 ); testwf6( outwf, "vxxxxx", ievt, nsp, vmass ); @@ -276,6 +380,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) } // Test sxxxxx - NO ASSUMPTIONS { + prepareTest( "sxxxxx", ievt ); const fptype smass = mass0[ievt]; sxxxxx( ievt0Momenta, nsp, fp_outwf3, ipar0 ); // no mass, no helicity (was "smass>0") testwf6( outwf3, "sxxxxx", ievt, 
nsp, smass ); @@ -284,6 +389,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) } // Test oxxxxx - NO ASSUMPTIONS { + prepareTest( "oxxxxx", ievt ); const fptype fmass = mass0[ievt]; oxxxxx( ievt0Momenta, fmass, nhel, nsp, fp_outwfO, ipar0 ); testwf6( outwfO, "oxxxxx", ievt, nsp, fmass ); @@ -293,6 +399,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test opzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == +PZ > 0) if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzgt0[ievt] ) { + prepareTest( "opzxxx", ievt ); opzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfO, "opzxxx", ievt ); testwf6( outwf, "opzxxx", ievt, nsp, 0 ); @@ -300,6 +407,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test omzxxx - ASSUMPTIONS: (FMASS == 0) and (PX == PY == 0 and E == -PZ > 0) if( mass0[ievt] == 0 && !isptgt0[ievt] && ispzlt0[ievt] ) { + prepareTest( "omzxxx", ievt ); omzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfO, "omzxxx", ievt ); testwf6( outwf, "omzxxx", ievt, nsp, 0 ); @@ -307,17 +415,25 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) // Test oxzxxx - ASSUMPTIONS: (FMASS == 0) and (PT > 0) if( mass0[ievt] == 0 && isptgt0[ievt] ) { - oxzxxx( ievt0Momenta, nhel, nsp, reinterpret_cast( outwf ), ipar0 ); + prepareTest( "oxzxxx", ievt ); + oxzxxx( ievt0Momenta, nhel, nsp, fp_outwf, ipar0 ); testwf6two( outwf, outwfO, "oxzxxx", ievt ); testwf6( outwf, "oxzxxx", ievt, nsp, 0 ); } } } + // *** END OF TESTING LOOP if( dumpEvents ) { dumpFile.close(); std::cout << "INFO: New reference data dumped to file '" << dumpFileName << "'" << std::endl; } +#ifndef __APPLE__ // test #701 (except on MacOS where fedisableexcept is not defined #730) + if( enableFPE ) + { + fedisableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ); // debug #701 + } +#endif } //========================================================================== diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt index 8bc0384a68..637530d1f5 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testxxx_cc_ref.txt @@ -1,14 +1,18 @@ + // Copyright (C) 2020-2023 CERN and UCLouvain. + // Licensed under the GNU Lesser General Public License (version 3 or later). + // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. + // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. expwfs.push_back( { // --------- 5.000000000000000e+02, 5.000000000000000e+02, // itest=0: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 -3.162277660168379e+01, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=0: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=0: ixxxxx#0 nsp=-1 mass=0 expwfs.push_back( { // --------- 5.000000000000000e+02, 5.000000000000000e+02, // itest=1: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 -3.162277660168379e+01, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=1: ixxxxx#0 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=1: ixxxxx#0 nsp=-1 mass=0 @@ -78,8 +82,8 @@ expwfs.push_back( { // --------- 5.000000000000000e+02, -5.000000000000000e+02, // itest=11: ixxxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=11: ixxxxx#1 nsp=-1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=11: ixxxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=11: ixxxxx#1 nsp=-1 mass=0 expwfs.push_back( { // --------- @@ -129,8 +133,8 @@ -0.000000000000000e+00, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=18: oxxxxx#1 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=18: oxxxxx#1 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=18: oxxxxx#1 nsp=-1 mass=0 expwfs.push_back( { // --------- -5.000000000000000e+02, 5.000000000000000e+02, // itest=19: omzxxx#1 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=19: omzxxx#1 nsp=-1 mass=0 @@ -141,14 +145,14 @@ expwfs.push_back( { // --------- 5.000000000000000e+02, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 3.000000000000000e+02, 4.000000000000000e+02, // itest=20: ixxxxx#2 nsp=-1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=20: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=20: ixxxxx#2 nsp=-1 mass=0 -2.236067977499790e+01, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=20: ixxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=20: ixxxxx#2 nsp=-1 mass=0 expwfs.push_back( { // --------- 5.000000000000000e+02, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 3.000000000000000e+02, 4.000000000000000e+02, // itest=21: ixxxxx#2 nsp=-1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=21: ixxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=21: ixxxxx#2 nsp=-1 mass=0 -2.236067977499790e+01, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=21: ixxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=21: ixxxxx#2 nsp=-1 mass=0 @@ -192,14 +196,14 @@ -3.000000000000000e+02, -4.000000000000000e+02, // itest=27: oxxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=27: oxxxxx#2 nsp=-1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01, // itest=27: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=27: oxxxxx#2 nsp=-1 mass=0 -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=27: oxxxxx#2 nsp=-1 mass=0 expwfs.push_back( { // --------- -5.000000000000000e+02, -0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 -3.000000000000000e+02, -4.000000000000000e+02, // itest=28: oxxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 0.000000000000000e+00, 0.000000000000000e+00, // itest=28: oxxxxx#2 nsp=-1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01, // itest=28: oxxxxx#2 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=28: oxxxxx#2 nsp=-1 mass=0 -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=28: oxxxxx#2 nsp=-1 mass=0 expwfs.push_back( { // --------- -5.000000000000000e+02, -0.000000000000000e+00, // itest=29: oxzxxx#2 nsp=-1 mass=0 @@ -349,1696 +353,3684 @@ 1.800000000000000e+01, 2.400000000000000e+01, // itest=49: 
oxzxxx#4 nsp=-1 mass=0 -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=49: oxzxxx#4 nsp=-1 mass=0 expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=50: ixxxxx#5 nsp=-1 mass=500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=51: ixxxxx#5 nsp=-1 mass=-500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=52: vxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=52: vxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=52: vxxxxx#5 nsp=-1 mass=500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 
mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=53: vxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=53: vxxxxx#5 nsp=-1 mass=-500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: sxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=54: sxxxxx#5 nsp=-1 mass=500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=55: sxxxxx#5 nsp=-1 mass=-500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: oxxxxx#5 nsp=-1 mass=500 - -2.236067977499790e+01, 
0.000000000000000e+00 } ); // itest=56: oxxxxx#5 nsp=-1 mass=500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=57: oxxxxx#5 nsp=-1 mass=-500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=58: ixxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=58: ixxxxx#6 nsp=-1 mass=400 - 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=58: ixxxxx#6 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=59: ixxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=59: ixxxxx#6 nsp=-1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=59: ixxxxx#6 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=60: vxxxxx#6 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=60: vxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=60: vxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=60: vxxxxx#6 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=61: vxxxxx#6 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=61: vxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=61: vxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=61: vxxxxx#6 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=62: sxxxxx#6 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: sxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=62: sxxxxx#6 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=63: sxxxxx#6 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: sxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=63: sxxxxx#6 
nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=64: oxxxxx#6 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 - 1.414213562373095e+01, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=64: oxxxxx#6 nsp=-1 mass=400 - -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=64: oxxxxx#6 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=65: oxxxxx#6 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: oxxxxx#6 nsp=-1 mass=-400 - -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=65: oxxxxx#6 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=66: ixxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 - 1.414213562373095e+01, 0.000000000000000e+00, // itest=66: ixxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=66: ixxxxx#7 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=67: ixxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 
mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00, // itest=67: ixxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=67: ixxxxx#7 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=68: vxxxxx#7 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=68: vxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=68: vxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=68: vxxxxx#7 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=69: vxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=69: vxxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=69: vxxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=69: vxxxxx#7 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=70: sxxxxx#7 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: sxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=70: sxxxxx#7 nsp=-1 mass=400 - expwfs.push_back( { // --------- 
- -5.000000000000000e+02, 3.000000000000000e+02, // itest=71: sxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: sxxxxx#7 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=71: sxxxxx#7 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=72: oxxxxx#7 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 - 1.414213562373095e+01, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=72: oxxxxx#7 nsp=-1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=72: oxxxxx#7 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=73: oxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 - -2.828427124746190e+01, -0.000000000000000e+00, // itest=73: oxxxxx#7 nsp=-1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=73: oxxxxx#7 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=74: ixxxxx#8 nsp=-1 mass=400 - 1.200000000000000e+01, -1.600000000000000e+01, // itest=74: ixxxxx#8 nsp=-1 mass=400 - -2.000000000000000e+01, 
-0.000000000000000e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 - -5.999999999999999e+00, 7.999999999999999e+00, // itest=74: ixxxxx#8 nsp=-1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=74: ixxxxx#8 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=75: ixxxxx#8 nsp=-1 mass=-400 - 1.200000000000000e+01, -1.600000000000000e+01, // itest=75: ixxxxx#8 nsp=-1 mass=-400 - -2.000000000000000e+01, -0.000000000000000e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 - 5.999999999999999e+00, -7.999999999999999e+00, // itest=75: ixxxxx#8 nsp=-1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=75: ixxxxx#8 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=76: vxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: vxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, 5.656854249492381e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, -4.242640687119285e-01, // itest=76: vxxxxx#8 nsp=-1 mass=400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=76: vxxxxx#8 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=77: vxxxxx#8 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=77: vxxxxx#8 nsp=-1 mass=-400 - -0.000000000000000e+00, 5.656854249492381e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 - -0.000000000000000e+00, -4.242640687119285e-01, // itest=77: vxxxxx#8 nsp=-1 mass=-400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=77: vxxxxx#8 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 
-0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=78: sxxxxx#8 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: sxxxxx#8 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=78: sxxxxx#8 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=79: sxxxxx#8 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: sxxxxx#8 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=79: sxxxxx#8 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=80: oxxxxx#8 nsp=-1 mass=400 - -5.999999999999999e+00, -7.999999999999999e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=80: oxxxxx#8 nsp=-1 mass=400 - 1.200000000000000e+01, 1.600000000000000e+01, // itest=80: oxxxxx#8 nsp=-1 mass=400 - -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=80: oxxxxx#8 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=81: oxxxxx#8 nsp=-1 mass=-400 - 5.999999999999999e+00, 7.999999999999999e+00, // itest=81: oxxxxx#8 nsp=-1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00, // itest=81: oxxxxx#8 
nsp=-1 mass=-400 - 1.200000000000000e+01, 1.600000000000000e+01, // itest=81: oxxxxx#8 nsp=-1 mass=-400 - -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=81: oxxxxx#8 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=82: ixxxxx#9 nsp=-1 mass=400 - -1.600000000000000e+01, 1.200000000000000e+01, // itest=82: ixxxxx#9 nsp=-1 mass=400 - -2.000000000000000e+01, -0.000000000000000e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 - 7.999999999999999e+00, -5.999999999999999e+00, // itest=82: ixxxxx#9 nsp=-1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=82: ixxxxx#9 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=83: ixxxxx#9 nsp=-1 mass=-400 - -1.600000000000000e+01, 1.200000000000000e+01, // itest=83: ixxxxx#9 nsp=-1 mass=-400 - -2.000000000000000e+01, -0.000000000000000e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 - -7.999999999999999e+00, 5.999999999999999e+00, // itest=83: ixxxxx#9 nsp=-1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=83: ixxxxx#9 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=84: vxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=84: vxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, -4.242640687119285e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, 5.656854249492381e-01, // itest=84: vxxxxx#9 nsp=-1 mass=400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=84: vxxxxx#9 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 
mass=-400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=85: vxxxxx#9 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=85: vxxxxx#9 nsp=-1 mass=-400 - 0.000000000000000e+00, -4.242640687119285e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 - 0.000000000000000e+00, 5.656854249492381e-01, // itest=85: vxxxxx#9 nsp=-1 mass=-400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=85: vxxxxx#9 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=86: sxxxxx#9 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#9 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=86: sxxxxx#9 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=87: sxxxxx#9 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: sxxxxx#9 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=87: sxxxxx#9 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=88: oxxxxx#9 nsp=-1 mass=400 - 7.999999999999999e+00, 5.999999999999999e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=88: oxxxxx#9 nsp=-1 mass=400 - -1.600000000000000e+01, 
-1.200000000000000e+01, // itest=88: oxxxxx#9 nsp=-1 mass=400 - -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=88: oxxxxx#9 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=89: oxxxxx#9 nsp=-1 mass=-400 - -7.999999999999999e+00, -5.999999999999999e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00, // itest=89: oxxxxx#9 nsp=-1 mass=-400 - -1.600000000000000e+01, -1.200000000000000e+01, // itest=89: oxxxxx#9 nsp=-1 mass=-400 - -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=89: oxxxxx#9 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=90: ixxxxx#10 nsp=-1 mass=400 - 9.863939238321439e+00, -1.052153518754287e+01, // itest=90: ixxxxx#10 nsp=-1 mass=400 - -2.433105012119288e+01, -0.000000000000000e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 - -4.931969619160719e+00, 5.260767593771432e+00, // itest=90: ixxxxx#10 nsp=-1 mass=400 - 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=90: ixxxxx#10 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=91: ixxxxx#10 nsp=-1 mass=-400 - 9.863939238321439e+00, -1.052153518754287e+01, // itest=91: ixxxxx#10 nsp=-1 mass=-400 - -2.433105012119288e+01, -0.000000000000000e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 - 4.931969619160719e+00, -5.260767593771432e+00, // itest=91: ixxxxx#10 nsp=-1 mass=-400 - -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=91: ixxxxx#10 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 - 
-1.800000000000000e+02, -1.920000000000000e+02, // itest=92: vxxxxx#10 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=92: vxxxxx#10 nsp=-1 mass=400 - -2.321373168788980e-01, 5.158607041753289e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 - -2.476131380041579e-01, -4.836194101643708e-01, // itest=92: vxxxxx#10 nsp=-1 mass=400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=92: vxxxxx#10 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=93: vxxxxx#10 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=93: vxxxxx#10 nsp=-1 mass=-400 - -2.321373168788980e-01, 5.158607041753289e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 - -2.476131380041579e-01, -4.836194101643708e-01, // itest=93: vxxxxx#10 nsp=-1 mass=-400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=93: vxxxxx#10 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=94: sxxxxx#10 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: sxxxxx#10 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=94: sxxxxx#10 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=95: sxxxxx#10 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=95: sxxxxx#10 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=95: sxxxxx#10 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=96: oxxxxx#10 nsp=-1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=96: oxxxxx#10 nsp=-1 mass=400 - -4.931969619160719e+00, -5.260767593771432e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 - 1.216552506059644e+01, 0.000000000000000e+00, // itest=96: oxxxxx#10 nsp=-1 mass=400 - 9.863939238321439e+00, 1.052153518754287e+01, // itest=96: oxxxxx#10 nsp=-1 mass=400 - -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=96: oxxxxx#10 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=97: oxxxxx#10 nsp=-1 mass=-400 - 4.931969619160719e+00, 5.260767593771432e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 - -1.216552506059644e+01, -0.000000000000000e+00, // itest=97: oxxxxx#10 nsp=-1 mass=-400 - 9.863939238321439e+00, 1.052153518754287e+01, // itest=97: oxxxxx#10 nsp=-1 mass=-400 - -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=97: oxxxxx#10 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=98: ixxxxx#11 nsp=-1 mass=400 - 1.664100588675688e+01, -1.775040627920733e+01, // itest=98: ixxxxx#11 nsp=-1 mass=400 - -1.442220510185596e+01, -0.000000000000000e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 - -8.320502943378436e+00, 8.875203139603666e+00, // itest=98: ixxxxx#11 nsp=-1 mass=400 - 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=98: ixxxxx#11 nsp=-1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 - 
1.800000000000000e+02, 1.920000000000000e+02, // itest=99: ixxxxx#11 nsp=-1 mass=-400 - 1.664100588675688e+01, -1.775040627920733e+01, // itest=99: ixxxxx#11 nsp=-1 mass=-400 - -1.442220510185596e+01, -0.000000000000000e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 - 8.320502943378436e+00, -8.875203139603666e+00, // itest=99: ixxxxx#11 nsp=-1 mass=-400 - -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=99: ixxxxx#11 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=100: vxxxxx#11 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: vxxxxx#11 nsp=-1 mass=400 - 2.321373168788980e-01, 5.158607041753289e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 - 2.476131380041579e-01, -4.836194101643708e-01, // itest=100: vxxxxx#11 nsp=-1 mass=400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=100: vxxxxx#11 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=101: vxxxxx#11 nsp=-1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=101: vxxxxx#11 nsp=-1 mass=-400 - 2.321373168788980e-01, 5.158607041753289e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 - 2.476131380041579e-01, -4.836194101643708e-01, // itest=101: vxxxxx#11 nsp=-1 mass=-400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=101: vxxxxx#11 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=102: sxxxxx#11 nsp=-1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 - 
0.000000000000000e+00, 0.000000000000000e+00, // itest=102: sxxxxx#11 nsp=-1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=102: sxxxxx#11 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=103: sxxxxx#11 nsp=-1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: sxxxxx#11 nsp=-1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=103: sxxxxx#11 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=104: oxxxxx#11 nsp=-1 mass=400 - -8.320502943378436e+00, -8.875203139603666e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 - 7.211102550927978e+00, 0.000000000000000e+00, // itest=104: oxxxxx#11 nsp=-1 mass=400 - 1.664100588675688e+01, 1.775040627920733e+01, // itest=104: oxxxxx#11 nsp=-1 mass=400 - -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=104: oxxxxx#11 nsp=-1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=105: oxxxxx#11 nsp=-1 mass=-400 - 8.320502943378436e+00, 8.875203139603666e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 - -7.211102550927978e+00, -0.000000000000000e+00, // itest=105: oxxxxx#11 nsp=-1 mass=-400 - 1.664100588675688e+01, 1.775040627920733e+01, // itest=105: oxxxxx#11 nsp=-1 mass=-400 - -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=105: oxxxxx#11 nsp=-1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // 
itest=106: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=106: ixxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=107: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: ixxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=107: ixxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=108: ipzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: ipzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=108: ipzxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=109: vxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=109: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 
-7.071067811865476e-01, // itest=109: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=109: vxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=110: vxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=110: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=110: vxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=110: vxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=111: sxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=111: sxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=112: sxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: sxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=112: sxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=113: oxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, 
-0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: oxxxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=113: oxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=114: oxxxxx#12 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: oxxxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=114: oxxxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=115: opzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: opzxxx#12 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=115: opzxxx#12 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=116: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: ixxxxx#13 nsp=-1 mass=0 - 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=116: ixxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=117: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: ixxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=117: ixxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=118: imzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: imzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=118: imzxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=119: vxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=119: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=119: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=119: vxxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=120: vxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=120: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=120: vxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=120: vxxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=121: sxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=121: sxxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=122: sxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: sxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=122: sxxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=123: oxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=123: oxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=123: oxxxxx#13 nsp=-1 mass=0 
- expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=124: oxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=124: oxxxxx#13 nsp=-1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=124: oxxxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=125: omzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=125: omzxxx#13 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=125: omzxxx#13 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=126: ixxxxx#14 nsp=-1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=126: ixxxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: ixxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=126: ixxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=127: ixxxxx#14 nsp=-1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=127: ixxxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 
0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: ixxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=127: ixxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=128: ixzxxx#14 nsp=-1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=128: ixzxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: ixzxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=128: ixzxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=129: vxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: vxxxxx#14 nsp=-1 mass=0 - -0.000000000000000e+00, 5.656854249492381e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 - -0.000000000000000e+00, -4.242640687119285e-01, // itest=129: vxxxxx#14 nsp=-1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=129: vxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=130: vxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=130: vxxxxx#14 nsp=-1 mass=0 - -0.000000000000000e+00, 5.656854249492381e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 - -0.000000000000000e+00, -4.242640687119285e-01, // itest=130: vxxxxx#14 nsp=-1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=130: vxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 
-0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=131: sxxxxx#14 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=131: sxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=132: sxxxxx#14 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: sxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=132: sxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=133: oxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: oxxxxx#14 nsp=-1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01, // itest=133: oxxxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=133: oxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=134: oxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: oxxxxx#14 nsp=-1 mass=0 
- 1.341640786499874e+01, 1.788854381999832e+01, // itest=134: oxxxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=134: oxxxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=135: oxzxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: oxzxxx#14 nsp=-1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01, // itest=135: oxzxxx#14 nsp=-1 mass=0 - -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=135: oxzxxx#14 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=136: ixxxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: ixxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=136: ixxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=137: ixxxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: ixxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=137: ixxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 - 1.800000000000000e+02, 
2.400000000000000e+02, // itest=138: ixzxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: ixzxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=138: ixzxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=139: vxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: vxxxxx#15 nsp=-1 mass=0 - -3.394112549695428e-01, 5.656854249492381e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 - -4.525483399593904e-01, -4.242640687119285e-01, // itest=139: vxxxxx#15 nsp=-1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=139: vxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=140: vxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=140: vxxxxx#15 nsp=-1 mass=0 - -3.394112549695428e-01, 5.656854249492381e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 - -4.525483399593904e-01, -4.242640687119285e-01, // itest=140: vxxxxx#15 nsp=-1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=140: vxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=141: sxxxxx#15 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: sxxxxx#15 nsp=-1 
mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=141: sxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=142: sxxxxx#15 nsp=-1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: sxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=142: sxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=143: oxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00, // itest=143: oxxxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=143: oxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=144: oxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00, // itest=144: oxxxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=144: oxxxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=145: oxzxxx#15 nsp=-1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=145: oxzxxx#15 nsp=-1 mass=0 - 
0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00, // itest=145: oxzxxx#15 nsp=-1 mass=0 - -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=145: oxzxxx#15 nsp=-1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=146: ixxxxx#0 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=146: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=146: ixxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=147: ixxxxx#0 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=147: ixxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=147: ixxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=148: ipzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=148: ipzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=148: ipzxxx#0 nsp=1 mass=0 - 
expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=149: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=149: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=149: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=149: vxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=150: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=150: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=150: vxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=150: vxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=151: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=151: sxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=152: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 
- 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: sxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=152: sxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=153: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=153: oxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=154: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: oxxxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=154: oxxxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=155: opzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: opzxxx#0 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=155: opzxxx#0 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=156: ixxxxx#1 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=156: ixxxxx#1 
nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: ixxxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=156: ixxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=157: ixxxxx#1 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=157: ixxxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=157: ixxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=158: imzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: imzxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=158: imzxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=159: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=159: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=159: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=159: vxxxxx#1 nsp=1 mass=0 - 
expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=160: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=160: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=160: vxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=160: vxxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=161: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=161: sxxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=162: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: sxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=162: sxxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=163: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 
mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=163: oxxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=164: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: oxxxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=164: oxxxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=165: omzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: omzxxx#1 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=165: omzxxx#1 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=166: ixxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=166: ixxxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=166: ixxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // 
itest=167: ixxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=167: ixxxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=167: ixxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=168: ixzxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=168: ixzxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=168: ixzxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=169: vxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=169: vxxxxx#2 nsp=1 mass=0 - -0.000000000000000e+00, -5.656854249492381e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 - -0.000000000000000e+00, 4.242640687119285e-01, // itest=169: vxxxxx#2 nsp=1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=169: vxxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=170: vxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=170: vxxxxx#2 nsp=1 mass=0 - -0.000000000000000e+00, -5.656854249492381e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 - -0.000000000000000e+00, 4.242640687119285e-01, // itest=170: vxxxxx#2 nsp=1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=170: vxxxxx#2 
nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=171: sxxxxx#2 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=171: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=171: sxxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=172: sxxxxx#2 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=172: sxxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=173: oxxxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=173: oxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: oxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=173: oxxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=174: oxxxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=174: 
oxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=174: oxxxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=174: oxxxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=175: oxzxxx#2 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=175: oxzxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxzxxx#2 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=175: oxzxxx#2 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=176: ixxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=176: ixxxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=176: ixxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=177: ixxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=177: ixxxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=177: ixxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 - -1.800000000000000e+02, 
-2.400000000000000e+02, // itest=178: ixzxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=178: ixzxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=178: ixzxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=179: vxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=179: vxxxxx#3 nsp=1 mass=0 - -3.394112549695428e-01, -5.656854249492381e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 - -4.525483399593904e-01, 4.242640687119285e-01, // itest=179: vxxxxx#3 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=179: vxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=180: vxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: vxxxxx#3 nsp=1 mass=0 - -3.394112549695428e-01, -5.656854249492381e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 - -4.525483399593904e-01, 4.242640687119285e-01, // itest=180: vxxxxx#3 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=180: vxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=181: sxxxxx#3 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); 
// itest=181: sxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=182: sxxxxx#3 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: sxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=182: sxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=183: oxxxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=183: oxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=183: oxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=184: oxxxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=184: oxxxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=184: oxxxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=185: oxzxxx#3 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=185: oxzxxx#3 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 - 6.000000000000000e+00, 
-8.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=185: oxzxxx#3 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=185: oxzxxx#3 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 4.000000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=186: ixxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=186: ixxxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=186: ixxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 4.000000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=187: ixxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=187: ixxxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=187: ixxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 4.000000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=188: ixzxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=188: ixzxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=188: ixzxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 - 
1.800000000000000e+02, 2.400000000000000e+02, // itest=189: vxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: vxxxxx#4 nsp=1 mass=0 - 3.394112549695428e-01, -5.656854249492381e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 - 4.525483399593904e-01, 4.242640687119285e-01, // itest=189: vxxxxx#4 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=189: vxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=190: vxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=190: vxxxxx#4 nsp=1 mass=0 - 3.394112549695428e-01, -5.656854249492381e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 - 4.525483399593904e-01, 4.242640687119285e-01, // itest=190: vxxxxx#4 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=190: vxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=191: sxxxxx#4 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=191: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=191: sxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=192: sxxxxx#4 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=192: sxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=192: sxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=193: oxxxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, -2.400000000000000e+01, // itest=193: oxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=193: oxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=193: oxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=194: oxxxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, -2.400000000000000e+01, // itest=194: oxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=194: oxxxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=194: oxxxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -4.000000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=195: oxzxxx#4 nsp=1 mass=0 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 - 1.800000000000000e+01, -2.400000000000000e+01, // itest=195: oxzxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=195: oxzxxx#4 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=195: oxzxxx#4 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 - 
0.000000000000000e+00, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=196: ixxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=196: ixxxxx#5 nsp=1 mass=500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=197: ixxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=197: ixxxxx#5 nsp=1 mass=-500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=198: vxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=198: vxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=198: vxxxxx#5 nsp=1 mass=500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=199: vxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=199: vxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=199: vxxxxx#5 nsp=1 mass=-500 - expwfs.push_back( { // --------- - 
5.000000000000000e+02, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=200: sxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=200: sxxxxx#5 nsp=1 mass=500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=201: sxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=201: sxxxxx#5 nsp=1 mass=-500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=202: oxxxxx#5 nsp=1 mass=500 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=202: oxxxxx#5 nsp=1 mass=500 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=203: 
oxxxxx#5 nsp=1 mass=-500 - -2.236067977499790e+01, 0.000000000000000e+00, // itest=203: oxxxxx#5 nsp=1 mass=-500 - -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=203: oxxxxx#5 nsp=1 mass=-500 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=204: ixxxxx#6 nsp=1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 - 1.414213562373095e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 - 2.828427124746190e+01, 0.000000000000000e+00, // itest=204: ixxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=204: ixxxxx#6 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -3.000000000000000e+02, // itest=205: ixxxxx#6 nsp=1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 - 2.828427124746190e+01, 0.000000000000000e+00, // itest=205: ixxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=205: ixxxxx#6 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=206: vxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=206: vxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=206: vxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=206: vxxxxx#6 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=207: vxxxxx#6 
nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=207: vxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=207: vxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=207: vxxxxx#6 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=208: sxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=208: sxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=208: sxxxxx#6 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=209: sxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=209: sxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=209: sxxxxx#6 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=210: oxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 - 2.828427124746190e+01, 0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 - 1.414213562373095e+01, 
0.000000000000000e+00, // itest=210: oxxxxx#6 nsp=1 mass=400 - 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=210: oxxxxx#6 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 3.000000000000000e+02, // itest=211: oxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 - 2.828427124746190e+01, 0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 - -1.414213562373095e+01, -0.000000000000000e+00, // itest=211: oxxxxx#6 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=211: oxxxxx#6 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=212: ixxxxx#7 nsp=1 mass=400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 - -1.414213562373095e+01, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: ixxxxx#7 nsp=1 mass=400 - -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=212: ixxxxx#7 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 3.000000000000000e+02, // itest=213: ixxxxx#7 nsp=1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 - 1.414213562373095e+01, -0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: ixxxxx#7 nsp=1 mass=-400 - -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=213: ixxxxx#7 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=214: vxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=214: vxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=214: vxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=214: vxxxxx#7 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=215: vxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=215: vxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=215: vxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=215: vxxxxx#7 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=216: sxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: sxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=216: sxxxxx#7 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=217: sxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: sxxxxx#7 nsp=1 
mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=217: sxxxxx#7 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=218: oxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 - -2.828427124746190e+01, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: oxxxxx#7 nsp=1 mass=400 - -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=218: oxxxxx#7 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -3.000000000000000e+02, // itest=219: oxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 - -2.828427124746190e+01, 0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=219: oxxxxx#7 nsp=1 mass=-400 - 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=219: oxxxxx#7 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=220: ixxxxx#8 nsp=1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 - 5.999999999999999e+00, 7.999999999999999e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=220: ixxxxx#8 nsp=1 mass=400 - 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=220: ixxxxx#8 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=221: ixxxxx#8 nsp=1 mass=-400 - 
-1.000000000000000e+01, -0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 - -5.999999999999999e+00, -7.999999999999999e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=221: ixxxxx#8 nsp=1 mass=-400 - 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=221: ixxxxx#8 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=222: vxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: vxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, -5.656854249492381e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, 4.242640687119285e-01, // itest=222: vxxxxx#8 nsp=1 mass=400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=222: vxxxxx#8 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=223: vxxxxx#8 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=223: vxxxxx#8 nsp=1 mass=-400 - -0.000000000000000e+00, -5.656854249492381e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 - -0.000000000000000e+00, 4.242640687119285e-01, // itest=223: vxxxxx#8 nsp=1 mass=-400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=223: vxxxxx#8 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=224: sxxxxx#8 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: sxxxxx#8 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=224: sxxxxx#8 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=225: sxxxxx#8 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: sxxxxx#8 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=225: sxxxxx#8 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=226: oxxxxx#8 nsp=1 mass=400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 - 1.200000000000000e+01, -1.600000000000000e+01, // itest=226: oxxxxx#8 nsp=1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=226: oxxxxx#8 nsp=1 mass=400 - 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=226: oxxxxx#8 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=227: oxxxxx#8 nsp=1 mass=-400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 - 1.200000000000000e+01, -1.600000000000000e+01, // itest=227: oxxxxx#8 nsp=1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00, // itest=227: oxxxxx#8 nsp=1 mass=-400 - -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=227: oxxxxx#8 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=228: ixxxxx#9 nsp=1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 
nsp=1 mass=400 - -7.999999999999999e+00, -5.999999999999999e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=228: ixxxxx#9 nsp=1 mass=400 - -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=228: ixxxxx#9 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 - 2.400000000000000e+02, 1.800000000000000e+02, // itest=229: ixxxxx#9 nsp=1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 - 7.999999999999999e+00, 5.999999999999999e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=229: ixxxxx#9 nsp=1 mass=-400 - -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=229: ixxxxx#9 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=230: vxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=230: vxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, 4.242640687119285e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, -5.656854249492381e-01, // itest=230: vxxxxx#9 nsp=1 mass=400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=230: vxxxxx#9 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=231: vxxxxx#9 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=231: vxxxxx#9 nsp=1 mass=-400 - 0.000000000000000e+00, 4.242640687119285e-01, // itest=231: vxxxxx#9 nsp=1 mass=-400 - 0.000000000000000e+00, -5.656854249492381e-01, // itest=231: vxxxxx#9 nsp=1 mass=-400 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=231: vxxxxx#9 nsp=1 mass=-400 - expwfs.push_back( { // 
--------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=232: sxxxxx#9 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: sxxxxx#9 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=232: sxxxxx#9 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=233: sxxxxx#9 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: sxxxxx#9 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=233: sxxxxx#9 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=234: oxxxxx#9 nsp=1 mass=400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 - -1.600000000000000e+01, 1.200000000000000e+01, // itest=234: oxxxxx#9 nsp=1 mass=400 - 1.000000000000000e+01, 0.000000000000000e+00, // itest=234: oxxxxx#9 nsp=1 mass=400 - -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=234: oxxxxx#9 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 - -2.400000000000000e+02, -1.800000000000000e+02, // itest=235: oxxxxx#9 nsp=1 mass=-400 - 2.000000000000000e+01, 0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 - -1.600000000000000e+01, 
1.200000000000000e+01, // itest=235: oxxxxx#9 nsp=1 mass=-400 - -1.000000000000000e+01, -0.000000000000000e+00, // itest=235: oxxxxx#9 nsp=1 mass=-400 - 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=235: oxxxxx#9 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=236: ixxxxx#10 nsp=1 mass=400 - 1.216552506059644e+01, 0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 - 4.931969619160719e+00, 5.260767593771432e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 - 2.433105012119288e+01, 0.000000000000000e+00, // itest=236: ixxxxx#10 nsp=1 mass=400 - 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=236: ixxxxx#10 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -1.440000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=237: ixxxxx#10 nsp=1 mass=-400 - -1.216552506059644e+01, -0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 - -4.931969619160719e+00, -5.260767593771432e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 - 2.433105012119288e+01, 0.000000000000000e+00, // itest=237: ixxxxx#10 nsp=1 mass=-400 - 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=237: ixxxxx#10 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=238: vxxxxx#10 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=238: vxxxxx#10 nsp=1 mass=400 - -2.321373168788980e-01, -5.158607041753289e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 - -2.476131380041579e-01, 4.836194101643708e-01, // itest=238: vxxxxx#10 nsp=1 mass=400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=238: vxxxxx#10 nsp=1 mass=400 - expwfs.push_back( { // --------- - 
5.000000000000000e+02, 1.440000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=239: vxxxxx#10 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=239: vxxxxx#10 nsp=1 mass=-400 - -2.321373168788980e-01, -5.158607041753289e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 - -2.476131380041579e-01, 4.836194101643708e-01, // itest=239: vxxxxx#10 nsp=1 mass=-400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=239: vxxxxx#10 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=240: sxxxxx#10 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=240: sxxxxx#10 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=240: sxxxxx#10 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=241: sxxxxx#10 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=241: sxxxxx#10 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=241: sxxxxx#10 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=242: oxxxxx#10 nsp=1 mass=400 - 2.433105012119288e+01, 0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 - 9.863939238321439e+00, 
-1.052153518754287e+01, // itest=242: oxxxxx#10 nsp=1 mass=400 - 1.216552506059644e+01, 0.000000000000000e+00, // itest=242: oxxxxx#10 nsp=1 mass=400 - 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=242: oxxxxx#10 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 1.440000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=243: oxxxxx#10 nsp=1 mass=-400 - 2.433105012119288e+01, 0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 - 9.863939238321439e+00, -1.052153518754287e+01, // itest=243: oxxxxx#10 nsp=1 mass=-400 - -1.216552506059644e+01, -0.000000000000000e+00, // itest=243: oxxxxx#10 nsp=1 mass=-400 - -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=243: oxxxxx#10 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=244: ixxxxx#11 nsp=1 mass=400 - 7.211102550927978e+00, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 - 8.320502943378436e+00, 8.875203139603666e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 - 1.442220510185596e+01, 0.000000000000000e+00, // itest=244: ixxxxx#11 nsp=1 mass=400 - 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=244: ixxxxx#11 nsp=1 mass=400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 1.440000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 - -1.800000000000000e+02, -1.920000000000000e+02, // itest=245: ixxxxx#11 nsp=1 mass=-400 - -7.211102550927978e+00, -0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 - -8.320502943378436e+00, -8.875203139603666e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 - 1.442220510185596e+01, 0.000000000000000e+00, // itest=245: ixxxxx#11 nsp=1 mass=-400 - 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=245: ixxxxx#11 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 
5.000000000000000e+02, -1.440000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=246: vxxxxx#11 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=246: vxxxxx#11 nsp=1 mass=400 - 2.321373168788980e-01, -5.158607041753289e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 - 2.476131380041579e-01, 4.836194101643708e-01, // itest=246: vxxxxx#11 nsp=1 mass=400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=246: vxxxxx#11 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=247: vxxxxx#11 nsp=1 mass=-400 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=247: vxxxxx#11 nsp=1 mass=-400 - 2.321373168788980e-01, -5.158607041753289e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 - 2.476131380041579e-01, 4.836194101643708e-01, // itest=247: vxxxxx#11 nsp=1 mass=-400 - 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=247: vxxxxx#11 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=248: sxxxxx#11 nsp=1 mass=400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=248: sxxxxx#11 nsp=1 mass=400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=248: sxxxxx#11 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=249: sxxxxx#11 nsp=1 mass=-400 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=249: sxxxxx#11 nsp=1 mass=-400 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=249: sxxxxx#11 nsp=1 mass=-400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=250: oxxxxx#11 nsp=1 mass=400 - 1.442220510185596e+01, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 - 1.664100588675688e+01, -1.775040627920733e+01, // itest=250: oxxxxx#11 nsp=1 mass=400 - 7.211102550927978e+00, 0.000000000000000e+00, // itest=250: oxxxxx#11 nsp=1 mass=400 - 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=250: oxxxxx#11 nsp=1 mass=400 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -1.440000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 - 1.800000000000000e+02, 1.920000000000000e+02, // itest=251: oxxxxx#11 nsp=1 mass=-400 - 1.442220510185596e+01, 0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 - 1.664100588675688e+01, -1.775040627920733e+01, // itest=251: oxxxxx#11 nsp=1 mass=-400 - -7.211102550927978e+00, -0.000000000000000e+00, // itest=251: oxxxxx#11 nsp=1 mass=-400 - -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=251: oxxxxx#11 nsp=1 mass=-400 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=252: ixxxxx#12 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=252: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=252: ixxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 
-5.000000000000000e+02, // itest=253: ixxxxx#12 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=253: ixxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=253: ixxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -5.000000000000000e+02, // itest=254: ipzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=254: ipzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=254: ipzxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=255: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=255: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 7.071067811865476e-01, // itest=255: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=255: vxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=256: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=256: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 
7.071067811865476e-01, // itest=256: vxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=256: vxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=257: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=257: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=257: sxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=258: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: sxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=258: sxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=259: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=259: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=259: oxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=260: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 
mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, -0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: oxxxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=260: oxxxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 5.000000000000000e+02, // itest=261: opzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 - 3.162277660168379e+01, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: opzxxx#12 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=261: opzxxx#12 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=262: ixxxxx#13 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=262: ixxxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=262: ixxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=263: ixxxxx#13 nsp=1 mass=0 - -0.000000000000000e+00, -0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=263: ixxxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00 } ); // itest=263: ixxxxx#13 
nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, 5.000000000000000e+02, // itest=264: imzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=264: imzxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=264: imzxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=265: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=265: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=265: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=265: vxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=266: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 - -7.071067811865476e-01, 0.000000000000000e+00, // itest=266: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, -7.071067811865476e-01, // itest=266: vxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=266: vxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=267: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 
0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=267: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=267: sxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=268: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=268: sxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=269: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=269: oxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=270: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 - -0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, -0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=270: oxxxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=270: oxxxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, -5.000000000000000e+02, // itest=271: omzxxx#13 
nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 - -3.162277660168379e+01, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=271: omzxxx#13 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=271: omzxxx#13 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=272: ixxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=272: ixxxxx#14 nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=272: ixxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=273: ixxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=273: ixxxxx#14 nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=273: ixxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 - -3.000000000000000e+02, -4.000000000000000e+02, // itest=274: ixzxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: ixzxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=274: ixzxxx#14 
nsp=1 mass=0 - 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=274: ixzxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=275: vxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=275: vxxxxx#14 nsp=1 mass=0 - -0.000000000000000e+00, -5.656854249492381e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 - -0.000000000000000e+00, 4.242640687119285e-01, // itest=275: vxxxxx#14 nsp=1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=275: vxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=276: vxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: vxxxxx#14 nsp=1 mass=0 - -0.000000000000000e+00, -5.656854249492381e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 - -0.000000000000000e+00, 4.242640687119285e-01, // itest=276: vxxxxx#14 nsp=1 mass=0 - 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=276: vxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=277: sxxxxx#14 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=277: sxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=278: sxxxxx#14 nsp=1 mass=0 - 1.000000000000000e+00, 
0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: sxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=278: sxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=279: oxxxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=279: oxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=279: oxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=280: oxxxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=280: oxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=280: oxxxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=280: oxxxxx#14 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 - 3.000000000000000e+02, 4.000000000000000e+02, // itest=281: oxzxxx#14 nsp=1 mass=0 - 2.236067977499790e+01, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 - 1.341640786499874e+01, -1.788854381999832e+01, // itest=281: oxzxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=281: oxzxxx#14 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=281: oxzxxx#14 nsp=1 mass=0 - expwfs.push_back( { // 
--------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=282: ixxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=282: ixxxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=282: ixxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=283: ixxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=283: ixxxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=283: ixxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - -5.000000000000000e+02, -4.000000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 - -1.800000000000000e+02, -2.400000000000000e+02, // itest=284: ixzxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=284: ixzxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=284: ixzxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=285: vxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: vxxxxx#15 nsp=1 mass=0 - -3.394112549695428e-01, -5.656854249492381e-01, // itest=285: vxxxxx#15 
nsp=1 mass=0 - -4.525483399593904e-01, 4.242640687119285e-01, // itest=285: vxxxxx#15 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=285: vxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=286: vxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=286: vxxxxx#15 nsp=1 mass=0 - -3.394112549695428e-01, -5.656854249492381e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 - -4.525483399593904e-01, 4.242640687119285e-01, // itest=286: vxxxxx#15 nsp=1 mass=0 - 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=286: vxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=287: sxxxxx#15 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=287: sxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=288: sxxxxx#15 nsp=1 mass=0 - 1.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: sxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=288: sxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=289: oxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 
2.400000000000000e+02, // itest=289: oxxxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: oxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=289: oxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=290: oxxxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: oxxxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=290: oxxxxx#15 nsp=1 mass=0 - expwfs.push_back( { // --------- - 5.000000000000000e+02, 4.000000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 - 1.800000000000000e+02, 2.400000000000000e+02, // itest=291: oxzxxx#15 nsp=1 mass=0 - 3.000000000000000e+01, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 - 6.000000000000000e+00, -8.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: oxzxxx#15 nsp=1 mass=0 - 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=291: oxzxxx#15 nsp=1 mass=0 + 5.000000000000000e+02, 5.000000000000000e+02, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=50: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=50: ixxxxx#5 
nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=51: ixxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=51: ixxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=52: ipzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=52: ipzxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=53: vxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=53: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=53: vxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=54: vxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=54: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=54: vxxxxx#5 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // 
itest=54: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=54: vxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=54: vxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=55: sxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=55: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=55: sxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=56: sxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=56: sxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=56: sxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=57: oxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=57: oxxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=57: oxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=58: oxxxxx#5 nsp=-1 mass=0 + -0.000000000000000e+00, 
-0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=58: oxxxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=58: oxxxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=59: opzxxx#5 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=59: opzxxx#5 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=60: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=60: ixxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=61: ixxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } 
); // itest=61: ixxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=62: imzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=62: imzxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=63: vxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=63: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=63: vxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=64: vxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=64: vxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=64: vxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=65: sxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=65: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=65: sxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=66: sxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=66: sxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=66: sxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=67: oxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=67: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=67: oxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=68: oxxxxx#6 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=68: oxxxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=68: oxxxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=69: omzxxx#6 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=69: omzxxx#6 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=69: omzxxx#6 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=70: ixxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=70: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=70: ixxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=71: ixxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=71: ixxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=71: ixxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=72: ixzxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=72: ixzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=72: ixzxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=73: vxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=73: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=73: vxxxxx#7 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=73: vxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=74: vxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=74: vxxxxx#7 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=74: vxxxxx#7 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=74: vxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=75: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=75: sxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=76: sxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=76: sxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=77: oxxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=77: oxxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=77: oxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=78: oxxxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=78: oxxxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=78: oxxxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=79: oxzxxx#7 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=79: oxzxxx#7 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=79: oxzxxx#7 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // 
itest=80: ixxxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=80: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=80: ixxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=81: ixxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=81: ixxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=82: ixzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=82: ixzxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=83: vxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=83: vxxxxx#8 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=83: vxxxxx#8 
nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=83: vxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=84: vxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=84: vxxxxx#8 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=84: vxxxxx#8 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=84: vxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=85: sxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=85: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=85: sxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=86: sxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=86: sxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=86: sxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=87: oxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=87: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=87: oxxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=87: oxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=88: oxxxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=88: oxxxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=88: oxxxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=89: oxzxxx#8 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=89: oxzxxx#8 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=89: oxzxxx#8 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=90: ixxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=90: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=90: ixxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, -4.000000000000000e+02, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=91: ixxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=91: ixxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=91: ixxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=92: ixzxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=92: ixzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=92: ixzxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=93: vxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=93: vxxxxx#9 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=93: vxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=94: vxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 
4.525483399593904e-01, -4.242640687119285e-01, // itest=94: vxxxxx#9 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=94: vxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=95: sxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=95: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=95: sxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=96: sxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=96: sxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=96: sxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=97: oxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=97: oxxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=97: oxxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=97: oxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=98: oxxxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=98: 
oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=98: oxxxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=98: oxxxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=98: oxxxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=98: oxxxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=99: oxzxxx#9 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=99: oxzxxx#9 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=99: oxzxxx#9 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=99: oxzxxx#9 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=100: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=100: ixxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=101: ixxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=101: 
ixxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=102: ipzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=102: ipzxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=103: vxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=103: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=103: vxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=104: vxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=104: vxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=104: vxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=105: sxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 
+ 0.000000000000000e+00, 0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=105: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=105: sxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=106: sxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=106: sxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=106: sxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=107: oxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=107: oxxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=107: oxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=108: oxxxxx#10 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=108: oxxxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=108: oxxxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -5.000000000000000e+02, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=109: opzxxx#10 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=109: opzxxx#10 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=110: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=110: ixxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=111: ixxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=111: ixxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: 
imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=112: imzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=112: imzxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=113: vxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=113: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=113: vxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=114: vxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=114: vxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=114: vxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=115: sxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=115: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=115: sxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=116: sxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=116: sxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=117: oxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=117: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=117: oxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=118: oxxxxx#11 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=118: oxxxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=118: oxxxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=119: omzxxx#11 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=119: 
omzxxx#11 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=119: omzxxx#11 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=120: ixxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=120: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=120: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=121: ixxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=121: ixxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=121: ixxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=122: ixzxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=122: ixzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=122: ixzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=123: vxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=123: vxxxxx#12 nsp=-1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=123: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=123: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=123: vxxxxx#12 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=123: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=124: vxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=124: vxxxxx#12 nsp=-1 mass=0 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=124: vxxxxx#12 nsp=-1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=124: vxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=125: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=125: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=126: sxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=126: sxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=127: oxxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=127: oxxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=127: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=128: oxxxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01, // itest=128: oxxxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=128: oxxxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=129: oxzxxx#12 nsp=-1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01, // itest=129: oxzxxx#12 nsp=-1 mass=0 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=129: oxzxxx#12 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=130: ixxxxx#13 
nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=130: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=130: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=131: ixxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=131: ixxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=132: ixzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=132: ixzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=133: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=133: vxxxxx#13 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=133: vxxxxx#13 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=133: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -4.000000000000000e+02, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=134: vxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -3.394112549695428e-01, 5.656854249492381e-01, // itest=134: vxxxxx#13 nsp=-1 mass=0 + -4.525483399593904e-01, -4.242640687119285e-01, // itest=134: vxxxxx#13 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=134: vxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=135: sxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=135: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=135: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=136: sxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=136: sxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=136: sxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=137: oxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=137: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=137: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=137: oxxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=137: oxxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=137: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=138: oxxxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=138: oxxxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=138: oxxxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=139: oxzxxx#13 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00, // itest=139: oxzxxx#13 nsp=-1 mass=0 + -3.000000000000000e+01, 0.000000000000000e+00 } ); // itest=139: oxzxxx#13 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=140: ixxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=140: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=140: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=141: ixxxxx#14 
nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=141: ixxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=141: ixxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=141: ixxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=142: ixzxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=142: ixzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=142: ixzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=143: vxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, // itest=143: vxxxxx#14 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=143: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=144: vxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 3.394112549695428e-01, 5.656854249492381e-01, // itest=144: vxxxxx#14 nsp=-1 mass=0 + 4.525483399593904e-01, -4.242640687119285e-01, 
// itest=144: vxxxxx#14 nsp=-1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=144: vxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=145: sxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=145: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=145: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=146: sxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=146: sxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=146: sxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=147: oxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=147: oxxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=147: oxxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=147: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=148: oxxxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=148: oxxxxx#14 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: oxxxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=148: oxxxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=148: oxxxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=148: oxxxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=149: oxzxxx#14 nsp=-1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=149: oxzxxx#14 nsp=-1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01, // itest=149: oxzxxx#14 nsp=-1 mass=0 + -1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=149: oxzxxx#14 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=150: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=150: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=151: ixxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=151: ixxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=152: ipzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=152: ipzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=153: vxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=153: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=153: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=154: vxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=154: vxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=154: vxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=155: sxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=155: sxxxxx#15 
nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=155: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=155: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=156: sxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=156: sxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=156: sxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=157: oxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=157: oxxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=157: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=158: oxxxxx#15 nsp=-1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=158: oxxxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=158: oxxxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 
-5.000000000000000e+02, -5.000000000000000e+02, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=159: opzxxx#15 nsp=-1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=159: opzxxx#15 nsp=-1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=160: ixxxxx#16 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=160: ixxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=161: ixxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=161: ixxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + -7.071067811865476e-01, 
0.000000000000000e+00, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=162: vxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=162: vxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=163: vxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=163: vxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=164: sxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=164: sxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=165: sxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=165: sxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // 
--------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=166: oxxxxx#16 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=166: oxxxxx#16 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=167: oxxxxx#16 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=167: oxxxxx#16 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=168: ixxxxx#17 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=168: ixxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 
mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=169: ixxxxx#17 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=169: ixxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=170: vxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=170: vxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=170: vxxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=171: vxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=171: vxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=172: sxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=172: sxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=172: sxxxxx#17 
nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=173: sxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=173: sxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=174: oxxxxx#17 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=174: oxxxxx#17 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=175: oxxxxx#17 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=175: oxxxxx#17 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + -2.828427124746190e+01, 
-0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=176: ixxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=176: ixxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=177: ixxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=177: ixxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=178: vxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=178: vxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=178: vxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=179: vxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=179: vxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=180: sxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=180: sxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=180: sxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=181: sxxxxx#18 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=181: sxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=182: oxxxxx#18 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=182: oxxxxx#18 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 
mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=183: oxxxxx#18 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=183: oxxxxx#18 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=184: ixxxxx#19 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=184: ixxxxx#19 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=184: ixxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=185: ixxxxx#19 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=185: ixxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=186: vxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=186: vxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=186: vxxxxx#19 nsp=-1 
mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=186: vxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=187: vxxxxx#19 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=187: vxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=188: sxxxxx#19 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=188: sxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=189: sxxxxx#19 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=189: sxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -1.800000000000000e+02, 
-2.400000000000000e+02, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=190: oxxxxx#19 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=190: oxxxxx#19 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=190: oxxxxx#19 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=191: oxxxxx#19 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=191: oxxxxx#19 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=192: ixxxxx#20 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=192: ixxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=192: ixxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -7.999999999999999e+00, 
5.999999999999999e+00, // itest=193: ixxxxx#20 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=193: ixxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=194: vxxxxx#20 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=194: vxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=195: vxxxxx#20 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=195: vxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=196: sxxxxx#20 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=196: sxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 
mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=197: sxxxxx#20 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=197: sxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=198: oxxxxx#20 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=198: oxxxxx#20 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=198: oxxxxx#20 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=199: oxxxxx#20 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=199: oxxxxx#20 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=200: ixxxxx#21 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=200: ixxxxx#21 nsp=-1 
mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=200: ixxxxx#21 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=200: ixxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=201: ixxxxx#21 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=201: ixxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=202: vxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=202: vxxxxx#21 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=202: vxxxxx#21 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=202: vxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=203: vxxxxx#21 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=203: vxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-1.440000000000000e+02, // itest=204: sxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=204: sxxxxx#21 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=204: sxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=205: sxxxxx#21 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=205: sxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=206: oxxxxx#21 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=206: oxxxxx#21 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=206: oxxxxx#21 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=206: oxxxxx#21 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + -1.216552506059644e+01, 
-0.000000000000000e+00, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=207: oxxxxx#21 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=207: oxxxxx#21 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=208: ixxxxx#22 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=208: ixxxxx#22 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=208: ixxxxx#22 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=208: ixxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=209: ixxxxx#22 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=209: ixxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=210: vxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=210: vxxxxx#22 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=210: vxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // 
--------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=211: vxxxxx#22 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=211: vxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=212: sxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=212: sxxxxx#22 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=212: sxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=213: sxxxxx#22 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=213: sxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=214: oxxxxx#22 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=214: oxxxxx#22 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=214: oxxxxx#22 nsp=-1 
mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=214: oxxxxx#22 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=214: oxxxxx#22 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=214: oxxxxx#22 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=215: oxxxxx#22 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=215: oxxxxx#22 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=216: ixxxxx#23 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=216: ixxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=217: ixxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=217: ixxxxx#23 nsp=-1 
mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=218: vxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=218: vxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=219: vxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=219: vxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=220: sxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=220: sxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, 
// itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=221: sxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=221: sxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=222: oxxxxx#23 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=222: oxxxxx#23 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=223: oxxxxx#23 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=223: oxxxxx#23 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=224: ixxxxx#24 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // 
itest=224: ixxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=225: ixxxxx#24 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=225: ixxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=226: vxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=226: vxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=226: vxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=227: vxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=227: vxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=228: sxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 
1.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=228: sxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=228: sxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=229: sxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=229: sxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=230: oxxxxx#24 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=230: oxxxxx#24 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=231: oxxxxx#24 nsp=-1 mass=-400 + 
-2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=231: oxxxxx#24 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=232: ixxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=232: ixxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=233: ixxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=233: ixxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=234: vxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=234: vxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=234: vxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // 
itest=235: vxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=235: vxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=235: vxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=236: sxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=236: sxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=236: sxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=237: sxxxxx#25 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=237: sxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=238: oxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=238: oxxxxx#25 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=238: 
oxxxxx#25 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=238: oxxxxx#25 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=239: oxxxxx#25 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=239: oxxxxx#25 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=240: ixxxxx#26 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + -5.999999999999999e+00, 7.999999999999999e+00, // itest=240: ixxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=240: ixxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + 5.999999999999999e+00, -7.999999999999999e+00, // itest=241: ixxxxx#26 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=241: ixxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 
-1.800000000000000e+02, -2.400000000000000e+02, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=242: vxxxxx#26 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=242: vxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, 5.656854249492381e-01, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + -0.000000000000000e+00, -4.242640687119285e-01, // itest=243: vxxxxx#26 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=243: vxxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=244: sxxxxx#26 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=244: sxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=245: sxxxxx#26 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=245: sxxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=246: oxxxxx#26 nsp=-1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=246: oxxxxx#26 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=246: oxxxxx#26 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01, // itest=247: oxxxxx#26 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=247: oxxxxx#26 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=248: ixxxxx#27 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + 7.999999999999999e+00, -5.999999999999999e+00, // itest=248: ixxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00 } ); // itest=248: ixxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // 
itest=249: ixxxxx#27 nsp=-1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -7.999999999999999e+00, 5.999999999999999e+00, // itest=249: ixxxxx#27 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00 } ); // itest=249: ixxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=250: vxxxxx#27 nsp=-1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=250: vxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, -4.242640687119285e-01, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 5.656854249492381e-01, // itest=251: vxxxxx#27 nsp=-1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=251: vxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=252: sxxxxx#27 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=252: sxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=253: sxxxxx#27 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=253: sxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=254: oxxxxx#27 nsp=-1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=254: oxxxxx#27 nsp=-1 mass=400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=254: oxxxxx#27 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01, // itest=255: oxxxxx#27 nsp=-1 mass=-400 + -2.000000000000000e+01, -0.000000000000000e+00 } ); // itest=255: oxxxxx#27 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 1.440000000000000e+02, // itest=256: ixxxxx#28 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=256: ixxxxx#28 nsp=-1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=256: ixxxxx#28 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=256: ixxxxx#28 nsp=-1 mass=400 + -4.931969619160719e+00, 5.260767593771432e+00, // itest=256: ixxxxx#28 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00 } ); // itest=256: ixxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + 4.931969619160719e+00, -5.260767593771432e+00, // itest=257: ixxxxx#28 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00 } ); // itest=257: ixxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=258: vxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -2.321373168788980e-01, 5.158607041753289e-01, // itest=258: vxxxxx#28 nsp=-1 mass=400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=258: vxxxxx#28 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=258: vxxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + 
-2.321373168788980e-01, 5.158607041753289e-01, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + -2.476131380041579e-01, -4.836194101643708e-01, // itest=259: vxxxxx#28 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=259: vxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=260: sxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=260: sxxxxx#28 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=260: sxxxxx#28 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=261: sxxxxx#28 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=261: sxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=262: oxxxxx#28 nsp=-1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=262: oxxxxx#28 nsp=-1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=262: oxxxxx#28 nsp=-1 mass=400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=262: oxxxxx#28 nsp=-1 mass=400 + 
expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01, // itest=263: oxxxxx#28 nsp=-1 mass=-400 + -2.433105012119288e+01, -0.000000000000000e+00 } ); // itest=263: oxxxxx#28 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=264: ixxxxx#29 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=264: ixxxxx#29 nsp=-1 mass=400 + -8.320502943378436e+00, 8.875203139603666e+00, // itest=264: ixxxxx#29 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00 } ); // itest=264: ixxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + 8.320502943378436e+00, -8.875203139603666e+00, // itest=265: ixxxxx#29 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00 } ); // itest=265: ixxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=266: vxxxxx#29 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=266: vxxxxx#29 nsp=-1 mass=400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=266: vxxxxx#29 nsp=-1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=266: vxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 2.321373168788980e-01, 5.158607041753289e-01, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 2.476131380041579e-01, -4.836194101643708e-01, // itest=267: vxxxxx#29 nsp=-1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=267: vxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=268: sxxxxx#29 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=268: sxxxxx#29 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=268: sxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=269: sxxxxx#29 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // 
itest=269: sxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=270: oxxxxx#29 nsp=-1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=270: oxxxxx#29 nsp=-1 mass=400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=270: oxxxxx#29 nsp=-1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=270: oxxxxx#29 nsp=-1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=270: oxxxxx#29 nsp=-1 mass=400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=270: oxxxxx#29 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01, // itest=271: oxxxxx#29 nsp=-1 mass=-400 + -1.442220510185596e+01, -0.000000000000000e+00 } ); // itest=271: oxxxxx#29 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=272: ixxxxx#30 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=272: ixxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=273: ixxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=273: ixxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=274: vxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=274: vxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=275: vxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=275: vxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=276: sxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=276: sxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=277: sxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=277: sxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=278: oxxxxx#30 nsp=-1 mass=500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=278: oxxxxx#30 nsp=-1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=279: oxxxxx#30 nsp=-1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00 } ); // itest=279: oxxxxx#30 nsp=-1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=280: ixxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 
mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=280: ixxxxx#31 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=280: ixxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=281: ixxxxx#31 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=281: ixxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=282: vxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=282: vxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=282: vxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=283: vxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=283: vxxxxx#31 nsp=-1 
mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=283: vxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=284: sxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=284: sxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=284: sxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=285: sxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=285: sxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=286: oxxxxx#31 nsp=-1 mass=400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=286: oxxxxx#31 nsp=-1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + -0.000000000000000e+00, 
-0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=287: oxxxxx#31 nsp=-1 mass=-400 + -2.828427124746190e+01, -0.000000000000000e+00 } ); // itest=287: oxxxxx#31 nsp=-1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=288: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=288: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=288: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=289: ixxxxx#0 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=289: ixxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=289: ixxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=290: ipzxxx#0 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=290: ipzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=291: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=291: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=292: vxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=292: vxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=293: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=293: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=294: sxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=294: sxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=295: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=295: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=296: oxxxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=296: oxxxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=297: opzxxx#0 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=297: opzxxx#0 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=298: 
ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=298: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=298: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=299: ixxxxx#1 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=299: ixxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=299: ixxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=300: imzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=300: imzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=301: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=301: vxxxxx#1 nsp=1 mass=0 
+ 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=301: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=302: vxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=302: vxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=303: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=303: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=304: sxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=304: sxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 
nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=305: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=305: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=306: oxxxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=306: oxxxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=307: omzxxx#1 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=307: omzxxx#1 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=308: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=308: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=308: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // 
itest=309: ixxxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=309: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=309: ixxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=309: ixxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=310: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=310: ixzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=310: ixzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=311: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=311: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=311: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=311: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=311: vxxxxx#2 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=311: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=312: vxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=312: vxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=312: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=312: vxxxxx#2 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=312: vxxxxx#2 
nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=312: vxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=313: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=313: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=313: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=314: sxxxxx#2 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=314: sxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=314: sxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=315: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=315: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=315: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=315: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=316: oxxxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=316: oxxxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=316: 
oxxxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=316: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=316: oxxxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=316: oxxxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=317: oxzxxx#2 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=317: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=317: oxzxxx#2 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=317: oxzxxx#2 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=318: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=318: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=318: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=318: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=319: ixxxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=319: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=319: ixxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=319: ixxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 
-4.000000000000000e+02, // itest=320: ixzxxx#3 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=320: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=320: ixzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=320: ixzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=321: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=321: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=321: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=321: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=321: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=321: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=322: vxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=322: vxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=322: vxxxxx#3 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=322: vxxxxx#3 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=322: vxxxxx#3 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=322: vxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=323: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=323: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // 
itest=323: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=323: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=324: sxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=324: sxxxxx#3 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=324: sxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=324: sxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=325: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=325: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=325: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=325: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=326: oxxxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=326: oxxxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=326: oxxxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=326: oxxxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=327: oxzxxx#3 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=327: oxzxxx#3 nsp=1 mass=0 + 3.000000000000000e+01, 
0.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=327: oxzxxx#3 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=327: oxzxxx#3 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=328: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=328: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=328: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=328: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=329: ixxxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=329: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=329: ixxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=329: ixxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=330: ixzxxx#4 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=330: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=330: ixzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=330: ixzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, -4.000000000000000e+02, // itest=331: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=331: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=331: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=331: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=331: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=331: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=332: vxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=332: vxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=332: vxxxxx#4 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=332: vxxxxx#4 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=332: vxxxxx#4 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=332: vxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=333: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=333: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=333: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=333: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=334: sxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=334: sxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=334: sxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=334: sxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=335: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=335: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=335: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=335: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=336: oxxxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=336: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=336: oxxxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=336: oxxxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=337: oxzxxx#4 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=337: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=337: oxzxxx#4 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=337: oxzxxx#4 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=338: ixxxxx#5 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=338: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=338: ixxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=339: ixxxxx#5 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=339: ixxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=339: ixxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=340: ipzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=340: ipzxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=341: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=341: vxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // 
--------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=342: vxxxxx#5 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=342: vxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=342: vxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=343: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=343: sxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=344: sxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=344: sxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=345: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=345: oxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=346: oxxxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=346: oxxxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=347: opzxxx#5 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=347: opzxxx#5 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=348: ixxxxx#6 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=348: ixxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=348: ixxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=349: ixxxxx#6 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=349: ixxxxx#6 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=349: ixxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=349: ixxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=350: imzxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=350: imzxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=351: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=351: vxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=352: vxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=352: vxxxxx#6 nsp=1 mass=0 + 
expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=353: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=353: sxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=354: sxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=354: sxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=355: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=355: oxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=356: oxxxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=356: oxxxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=357: omzxxx#6 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=357: omzxxx#6 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=358: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=358: ixxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=358: ixxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=359: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=359: ixxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=359: ixxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // 
itest=360: ixzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=360: ixzxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=360: ixzxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=361: vxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=361: vxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=361: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=361: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=361: vxxxxx#7 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=361: vxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=362: vxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=362: vxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=362: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=362: vxxxxx#7 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=362: vxxxxx#7 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=362: vxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=363: sxxxxx#7 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=363: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=363: sxxxxx#7 
nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=364: sxxxxx#7 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=364: sxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=364: sxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=365: oxxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=365: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=365: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=365: oxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=366: oxxxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=366: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=366: oxxxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=366: oxxxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=367: oxzxxx#7 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=367: 
oxzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=367: oxzxxx#7 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=367: oxzxxx#7 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=368: ixxxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=368: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=368: ixxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=368: ixxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=369: ixxxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=369: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=369: ixxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=369: ixxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=370: ixzxxx#8 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=370: ixzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=370: ixzxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=370: ixzxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=371: vxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 
2.400000000000000e+02, // itest=371: vxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=371: vxxxxx#8 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=371: vxxxxx#8 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=371: vxxxxx#8 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=371: vxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=372: vxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=372: vxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=372: vxxxxx#8 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=372: vxxxxx#8 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=372: vxxxxx#8 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=372: vxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=373: sxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=373: sxxxxx#8 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=373: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=373: sxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=374: sxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=374: sxxxxx#8 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=374: sxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); 
// itest=374: sxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=375: oxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=375: oxxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=375: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=375: oxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=376: oxxxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=376: oxxxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=376: oxxxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=376: oxxxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=377: oxzxxx#8 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=377: oxzxxx#8 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=377: oxzxxx#8 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=377: oxzxxx#8 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=378: ixxxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=378: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=378: ixxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=378: ixxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=379: ixxxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=379: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=379: ixxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=379: ixxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=380: ixzxxx#9 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=380: ixzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=380: ixzxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=380: ixzxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=381: vxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=381: vxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=381: vxxxxx#9 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=381: vxxxxx#9 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=381: vxxxxx#9 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=381: vxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=382: vxxxxx#9 nsp=1 mass=0 + 
1.800000000000000e+02, 2.400000000000000e+02, // itest=382: vxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=382: vxxxxx#9 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=382: vxxxxx#9 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=382: vxxxxx#9 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=382: vxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=383: sxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=383: sxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=383: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=383: sxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=384: sxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=384: sxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=384: sxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=384: sxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=385: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=385: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=385: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=385: oxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=386: oxxxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=386: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=386: oxxxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=386: oxxxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=387: oxzxxx#9 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=387: oxzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=387: oxzxxx#9 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=387: oxzxxx#9 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=388: ixxxxx#10 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=388: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=388: ixxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=389: ixxxxx#10 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 
+ 0.000000000000000e+00, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=389: ixxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=389: ixxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=390: ipzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=390: ipzxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=391: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=391: vxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=392: vxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=392: vxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // 
itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=393: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=393: sxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=394: sxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=394: sxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=395: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=395: oxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=396: oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=396: 
oxxxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=396: oxxxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=397: opzxxx#10 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=397: opzxxx#10 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=398: ixxxxx#11 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=398: ixxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=398: ixxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=399: ixxxxx#11 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=399: ixxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=399: ixxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 5.000000000000000e+02, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=400: imzxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00 } ); // itest=400: imzxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=401: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=401: vxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=402: vxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=402: vxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=403: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=403: sxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // 
--------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=404: sxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=404: sxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=405: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=405: oxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=406: oxxxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=406: oxxxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -5.000000000000000e+02, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + -3.162277660168379e+01, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 
mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=407: omzxxx#11 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=407: omzxxx#11 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=408: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=408: ixxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=408: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=409: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=409: ixxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999831e+01 } ); // itest=409: ixxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + -3.000000000000000e+02, -4.000000000000000e+02, // itest=410: ixzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=410: ixzxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, 1.788854381999832e+01 } ); // itest=410: ixzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=411: vxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 
4.000000000000000e+02, // itest=411: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=411: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=411: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=411: vxxxxx#12 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=411: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=412: vxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=412: vxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=412: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=412: vxxxxx#12 nsp=1 mass=0 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=412: vxxxxx#12 nsp=1 mass=0 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=412: vxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=413: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=413: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=413: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=414: sxxxxx#12 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=414: sxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=414: sxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=415: oxxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=415: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=415: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=415: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=416: oxxxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999831e+01, // itest=416: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=416: oxxxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=416: oxxxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 3.000000000000000e+02, 4.000000000000000e+02, // itest=417: oxzxxx#12 nsp=1 mass=0 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 1.341640786499874e+01, -1.788854381999832e+01, // itest=417: oxzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=417: oxzxxx#12 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=417: oxzxxx#12 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=418: ixxxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=418: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=418: ixxxxx#13 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=418: ixxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=418: ixxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=418: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=419: ixxxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=419: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=419: ixxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=419: ixxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -4.000000000000000e+02, // itest=420: ixzxxx#13 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=420: ixzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=420: ixzxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, 8.000000000000000e+00 } ); // itest=420: ixzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=421: vxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=421: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=421: vxxxxx#13 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=421: vxxxxx#13 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=421: vxxxxx#13 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=421: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
4.000000000000000e+02, // itest=422: vxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=422: vxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=422: vxxxxx#13 nsp=1 mass=0 + -3.394112549695428e-01, -5.656854249492381e-01, // itest=422: vxxxxx#13 nsp=1 mass=0 + -4.525483399593904e-01, 4.242640687119285e-01, // itest=422: vxxxxx#13 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=422: vxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=423: sxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=423: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=423: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=423: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=424: sxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=424: sxxxxx#13 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=424: sxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=424: sxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=425: oxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=425: oxxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=425: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=425: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=426: oxxxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=426: oxxxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=426: oxxxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=426: oxxxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 4.000000000000000e+02, // itest=427: oxzxxx#13 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=427: oxzxxx#13 nsp=1 mass=0 + 3.000000000000000e+01, 0.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 6.000000000000000e+00, -8.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=427: oxzxxx#13 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=427: oxzxxx#13 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=428: ixxxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=428: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=428: ixxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=428: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=429: ixxxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=429: ixxxxx#14 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=429: ixxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=429: ixxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 4.000000000000000e+02, // itest=430: ixzxxx#14 nsp=1 mass=0 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=430: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=430: ixzxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 2.400000000000000e+01 } ); // itest=430: ixzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=431: vxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=431: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=431: vxxxxx#14 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=431: vxxxxx#14 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=431: vxxxxx#14 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=431: vxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=432: vxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=432: vxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=432: vxxxxx#14 nsp=1 mass=0 + 3.394112549695428e-01, -5.656854249492381e-01, // itest=432: vxxxxx#14 nsp=1 mass=0 + 4.525483399593904e-01, 4.242640687119285e-01, // itest=432: vxxxxx#14 nsp=1 mass=0 + 4.242640687119285e-01, 0.000000000000000e+00 } ); // itest=432: vxxxxx#14 
nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=433: sxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=433: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=433: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=433: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=434: sxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=434: sxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=434: sxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=434: sxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=435: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=435: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=435: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=435: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=436: oxxxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, 
-2.400000000000000e+01, // itest=436: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=436: oxxxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=436: oxxxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -4.000000000000000e+02, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=437: oxzxxx#14 nsp=1 mass=0 + 1.800000000000000e+01, -2.400000000000000e+01, // itest=437: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=437: oxzxxx#14 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=437: oxzxxx#14 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=438: ixxxxx#15 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=438: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=438: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=439: ixxxxx#15 nsp=1 mass=0 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=439: ixxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=439: ixxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -5.000000000000000e+02, // itest=440: 
ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=440: ipzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=440: ipzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=441: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=441: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=442: vxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=442: vxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=443: sxxxxx#15 
nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=443: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=444: sxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=444: sxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=445: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=445: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=446: oxxxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=446: oxxxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 5.000000000000000e+02, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 3.162277660168379e+01, 
0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=447: opzxxx#15 nsp=1 mass=0 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=447: opzxxx#15 nsp=1 mass=0 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=448: ixxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=448: ixxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=449: ixxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=449: ixxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=450: vxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=450: 
vxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=451: vxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=451: vxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=452: sxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=452: sxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=453: sxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=453: sxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=454: 
oxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=454: oxxxxx#16 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=454: oxxxxx#16 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=455: oxxxxx#16 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=455: oxxxxx#16 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=456: ixxxxx#17 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=456: ixxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=456: ixxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=457: ixxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=457: ixxxxx#17 nsp=1 mass=-400 
+ expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=458: vxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=458: vxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=459: vxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=459: vxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=460: sxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=460: sxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=461: sxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=461: sxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=462: oxxxxx#17 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=462: oxxxxx#17 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=463: oxxxxx#17 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=463: oxxxxx#17 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=464: ixxxxx#18 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=464: ixxxxx#18 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=464: ixxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // 
--------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=465: ixxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=465: ixxxxx#18 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=465: ixxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=466: vxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=466: vxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=467: vxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=467: vxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=468: sxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=468: sxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=469: sxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=469: sxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=470: oxxxxx#18 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=470: oxxxxx#18 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=471: oxxxxx#18 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=471: oxxxxx#18 nsp=1 mass=-400 + expwfs.push_back( { // 
--------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=472: ixxxxx#19 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=472: ixxxxx#19 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=472: ixxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=473: ixxxxx#19 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=473: ixxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=474: vxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=474: vxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=474: vxxxxx#19 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=474: vxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=475: vxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=475: vxxxxx#19 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=475: vxxxxx#19 nsp=1 mass=-400 + 
-0.000000000000000e+00, -5.656854249492381e-01, // itest=475: vxxxxx#19 nsp=1 mass=-400 + -0.000000000000000e+00, 4.242640687119285e-01, // itest=475: vxxxxx#19 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=475: vxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=476: sxxxxx#19 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=476: sxxxxx#19 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=476: sxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=477: sxxxxx#19 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=477: sxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=478: oxxxxx#19 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=478: oxxxxx#19 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=478: oxxxxx#19 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=478: oxxxxx#19 nsp=1 mass=400 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=479: oxxxxx#19 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=479: oxxxxx#19 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=479: oxxxxx#19 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=480: ixxxxx#20 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=480: ixxxxx#20 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=480: ixxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=481: ixxxxx#20 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=481: ixxxxx#20 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=481: ixxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=482: vxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 
4.242640687119285e-01, // itest=482: vxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=482: vxxxxx#20 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=482: vxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=483: vxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=483: vxxxxx#20 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=483: vxxxxx#20 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=483: vxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=484: sxxxxx#20 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=484: sxxxxx#20 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=484: sxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=485: sxxxxx#20 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=485: sxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 
5.000000000000000e+02, 0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=486: oxxxxx#20 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=486: oxxxxx#20 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=486: oxxxxx#20 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=486: oxxxxx#20 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=487: oxxxxx#20 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=487: oxxxxx#20 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=487: oxxxxx#20 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=487: oxxxxx#20 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=488: ixxxxx#21 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=488: ixxxxx#21 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=488: ixxxxx#21 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=488: ixxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + -4.931969619160719e+00, 
-5.260767593771432e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=489: ixxxxx#21 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=489: ixxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=490: vxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=490: vxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=490: vxxxxx#21 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=490: vxxxxx#21 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=490: vxxxxx#21 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=490: vxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=491: vxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=491: vxxxxx#21 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=491: vxxxxx#21 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=491: vxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=492: sxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=492: sxxxxx#21 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=492: sxxxxx#21 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=492: sxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
1.440000000000000e+02, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=493: sxxxxx#21 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=493: sxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=494: oxxxxx#21 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=494: oxxxxx#21 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=494: oxxxxx#21 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=494: oxxxxx#21 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=494: oxxxxx#21 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=494: oxxxxx#21 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=495: oxxxxx#21 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=495: oxxxxx#21 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=495: oxxxxx#21 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=495: oxxxxx#21 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=496: ixxxxx#22 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=496: ixxxxx#22 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=496: ixxxxx#22 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // 
itest=496: ixxxxx#22 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=496: ixxxxx#22 nsp=1 mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=496: ixxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=497: ixxxxx#22 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=497: ixxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=498: vxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=498: vxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=498: vxxxxx#22 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=498: vxxxxx#22 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=498: vxxxxx#22 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=498: vxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=499: vxxxxx#22 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=499: vxxxxx#22 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=499: vxxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 
-1.440000000000000e+02, // itest=500: sxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=500: sxxxxx#22 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=500: sxxxxx#22 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=500: sxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=501: sxxxxx#22 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=501: sxxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=502: oxxxxx#22 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=502: oxxxxx#22 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=502: oxxxxx#22 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=502: oxxxxx#22 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=503: oxxxxx#22 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=503: 
oxxxxx#22 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=503: oxxxxx#22 nsp=1 mass=-400 + -8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=503: oxxxxx#22 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=504: ixxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=504: ixxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=505: ixxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=505: ixxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=506: vxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=506: vxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // 
itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=507: vxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=507: vxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=508: sxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=508: sxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=509: sxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=509: sxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 
2.236067977499790e+01, 0.000000000000000e+00, // itest=510: oxxxxx#23 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=510: oxxxxx#23 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=511: oxxxxx#23 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=511: oxxxxx#23 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=512: ixxxxx#24 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=512: ixxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=512: ixxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=513: ixxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=513: ixxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=514: vxxxxx#24 nsp=1 
mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=514: vxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=514: vxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=515: vxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=515: vxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=516: sxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=516: sxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=517: sxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=517: sxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=518: oxxxxx#24 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=518: oxxxxx#24 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=519: oxxxxx#24 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=519: oxxxxx#24 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=520: ixxxxx#25 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=520: ixxxxx#25 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=520: ixxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 3.000000000000000e+02, // itest=521: ixxxxx#25 nsp=1 mass=-400 + 
-0.000000000000000e+00, -0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=521: ixxxxx#25 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00 } ); // itest=521: ixxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=522: vxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=522: vxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, -7.071067811865476e-01, // itest=523: vxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=523: vxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 
0.000000000000000e+00, // itest=524: sxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=524: sxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=525: sxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=525: sxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=526: oxxxxx#25 nsp=1 mass=400 + -1.414213562373095e+01, 0.000000000000000e+00 } ); // itest=526: oxxxxx#25 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -3.000000000000000e+02, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + -2.828427124746190e+01, 0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=527: oxxxxx#25 nsp=1 mass=-400 + 1.414213562373095e+01, -0.000000000000000e+00 } ); // itest=527: oxxxxx#25 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 
-1.800000000000000e+02, -2.400000000000000e+02, // itest=528: ixxxxx#26 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 5.999999999999999e+00, 7.999999999999999e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=528: ixxxxx#26 nsp=1 mass=400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=528: ixxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -1.800000000000000e+02, -2.400000000000000e+02, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + -5.999999999999999e+00, -7.999999999999999e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=529: ixxxxx#26 nsp=1 mass=-400 + 1.200000000000000e+01, 1.600000000000000e+01 } ); // itest=529: ixxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=530: vxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=530: vxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=530: vxxxxx#26 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=530: vxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=531: vxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, -5.656854249492381e-01, // itest=531: vxxxxx#26 nsp=1 mass=-400 + -0.000000000000000e+00, 
4.242640687119285e-01, // itest=531: vxxxxx#26 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=531: vxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=532: sxxxxx#26 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=532: sxxxxx#26 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=532: sxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=533: sxxxxx#26 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=533: sxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.800000000000000e+02, 2.400000000000000e+02, // itest=534: oxxxxx#26 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=534: oxxxxx#26 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=534: oxxxxx#26 nsp=1 mass=400 + 5.999999999999999e+00, -7.999999999999999e+00 } ); // itest=534: oxxxxx#26 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 1.800000000000000e+02, 
2.400000000000000e+02, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + 1.200000000000000e+01, -1.600000000000000e+01, // itest=535: oxxxxx#26 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=535: oxxxxx#26 nsp=1 mass=-400 + -5.999999999999999e+00, 7.999999999999999e+00 } ); // itest=535: oxxxxx#26 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=536: ixxxxx#27 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + -7.999999999999999e+00, -5.999999999999999e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=536: ixxxxx#27 nsp=1 mass=400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=536: ixxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 2.400000000000000e+02, 1.800000000000000e+02, // itest=537: ixxxxx#27 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 7.999999999999999e+00, 5.999999999999999e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=537: ixxxxx#27 nsp=1 mass=-400 + -1.600000000000000e+01, -1.200000000000000e+01 } ); // itest=537: ixxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=538: vxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=538: vxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, -5.656854249492381e-01, // 
itest=538: vxxxxx#27 nsp=1 mass=400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=538: vxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=539: vxxxxx#27 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=539: vxxxxx#27 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 4.242640687119285e-01, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, -5.656854249492381e-01, // itest=539: vxxxxx#27 nsp=1 mass=-400 + 7.071067811865476e-01, 0.000000000000000e+00 } ); // itest=539: vxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=540: sxxxxx#27 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=540: sxxxxx#27 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=540: sxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=541: sxxxxx#27 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=541: sxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -2.400000000000000e+02, 
-1.800000000000000e+02, // itest=542: oxxxxx#27 nsp=1 mass=400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=542: oxxxxx#27 nsp=1 mass=400 + 1.000000000000000e+01, 0.000000000000000e+00, // itest=542: oxxxxx#27 nsp=1 mass=400 + -7.999999999999999e+00, 5.999999999999999e+00 } ); // itest=542: oxxxxx#27 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -2.400000000000000e+02, -1.800000000000000e+02, // itest=543: oxxxxx#27 nsp=1 mass=-400 + 2.000000000000000e+01, 0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -1.600000000000000e+01, 1.200000000000000e+01, // itest=543: oxxxxx#27 nsp=1 mass=-400 + -1.000000000000000e+01, -0.000000000000000e+00, // itest=543: oxxxxx#27 nsp=1 mass=-400 + 7.999999999999999e+00, -5.999999999999999e+00 } ); // itest=543: oxxxxx#27 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=544: ixxxxx#28 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=544: ixxxxx#28 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 4.931969619160719e+00, 5.260767593771432e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=544: ixxxxx#28 nsp=1 mass=400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=544: ixxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -1.440000000000000e+02, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=545: ixxxxx#28 nsp=1 mass=-400 + -4.931969619160719e+00, -5.260767593771432e+00, // itest=545: ixxxxx#28 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, 
// itest=545: ixxxxx#28 nsp=1 mass=-400 + 9.863939238321439e+00, 1.052153518754287e+01 } ); // itest=545: ixxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=546: vxxxxx#28 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=546: vxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=546: vxxxxx#28 nsp=1 mass=400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=546: vxxxxx#28 nsp=1 mass=400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=546: vxxxxx#28 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=546: vxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=547: vxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -2.321373168788980e-01, -5.158607041753289e-01, // itest=547: vxxxxx#28 nsp=1 mass=-400 + -2.476131380041579e-01, 4.836194101643708e-01, // itest=547: vxxxxx#28 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=547: vxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=548: sxxxxx#28 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=548: sxxxxx#28 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=548: sxxxxx#28 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=548: sxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, 
// itest=549: sxxxxx#28 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=549: sxxxxx#28 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=549: sxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=550: oxxxxx#28 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=550: oxxxxx#28 nsp=1 mass=400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=550: oxxxxx#28 nsp=1 mass=400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=550: oxxxxx#28 nsp=1 mass=400 + 1.216552506059644e+01, 0.000000000000000e+00, // itest=550: oxxxxx#28 nsp=1 mass=400 + 4.931969619160719e+00, -5.260767593771432e+00 } ); // itest=550: oxxxxx#28 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 1.440000000000000e+02, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 2.433105012119288e+01, 0.000000000000000e+00, // itest=551: oxxxxx#28 nsp=1 mass=-400 + 9.863939238321439e+00, -1.052153518754287e+01, // itest=551: oxxxxx#28 nsp=1 mass=-400 + -1.216552506059644e+01, -0.000000000000000e+00, // itest=551: oxxxxx#28 nsp=1 mass=-400 + -4.931969619160719e+00, 5.260767593771432e+00 } ); // itest=551: oxxxxx#28 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=552: ixxxxx#29 nsp=1 mass=400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=552: ixxxxx#29 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=552: ixxxxx#29 nsp=1 mass=400 + 8.320502943378436e+00, 8.875203139603666e+00, // itest=552: ixxxxx#29 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=552: ixxxxx#29 nsp=1 
mass=400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=552: ixxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, 1.440000000000000e+02, // itest=553: ixxxxx#29 nsp=1 mass=-400 + -1.800000000000000e+02, -1.920000000000000e+02, // itest=553: ixxxxx#29 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + -8.320502943378436e+00, -8.875203139603666e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=553: ixxxxx#29 nsp=1 mass=-400 + 1.664100588675688e+01, 1.775040627920733e+01 } ); // itest=553: ixxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=554: vxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=554: vxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=554: vxxxxx#29 nsp=1 mass=400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=554: vxxxxx#29 nsp=1 mass=400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=554: vxxxxx#29 nsp=1 mass=400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=554: vxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=555: vxxxxx#29 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 2.321373168788980e-01, -5.158607041753289e-01, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 2.476131380041579e-01, 4.836194101643708e-01, // itest=555: vxxxxx#29 nsp=1 mass=-400 + 6.203224967708328e-01, 0.000000000000000e+00 } ); // itest=555: vxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=556: sxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=556: 
sxxxxx#29 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=556: sxxxxx#29 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=556: sxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=557: sxxxxx#29 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=557: sxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=558: oxxxxx#29 nsp=1 mass=400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=558: oxxxxx#29 nsp=1 mass=400 + 7.211102550927978e+00, 0.000000000000000e+00, // itest=558: oxxxxx#29 nsp=1 mass=400 + 8.320502943378436e+00, -8.875203139603666e+00 } ); // itest=558: oxxxxx#29 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, -1.440000000000000e+02, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.800000000000000e+02, 1.920000000000000e+02, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.442220510185596e+01, 0.000000000000000e+00, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 1.664100588675688e+01, -1.775040627920733e+01, // itest=559: oxxxxx#29 nsp=1 mass=-400 + -7.211102550927978e+00, -0.000000000000000e+00, // itest=559: oxxxxx#29 nsp=1 mass=-400 + 
-8.320502943378436e+00, 8.875203139603666e+00 } ); // itest=559: oxxxxx#29 nsp=1 mass=-400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=560: ixxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=560: ixxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=561: ixxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=561: ixxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=562: vxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=562: vxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 
mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=563: vxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=563: vxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=564: sxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=564: sxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=565: sxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=565: sxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=566: oxxxxx#30 nsp=1 mass=500 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=566: oxxxxx#30 nsp=1 mass=500 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 2.236067977499790e+01, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + -2.236067977499790e+01, 0.000000000000000e+00, // itest=567: oxxxxx#30 nsp=1 mass=-500 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=567: oxxxxx#30 nsp=1 mass=-500 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=568: ixxxxx#31 nsp=1 mass=400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=568: ixxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=568: ixxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + -5.000000000000000e+02, -3.000000000000000e+02, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, -0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=569: ixxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=569: ixxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + 
0.000000000000000e+00, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=570: vxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=570: vxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + -7.071067811865476e-01, 0.000000000000000e+00, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 7.071067811865476e-01, // itest=571: vxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=571: vxxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=572: sxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=572: sxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 1.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=573: sxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 
0.000000000000000e+00 } ); // itest=573: sxxxxx#31 nsp=1 mass=-400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 1.414213562373095e+01, 0.000000000000000e+00, // itest=574: oxxxxx#31 nsp=1 mass=400 + 0.000000000000000e+00, -0.000000000000000e+00 } ); // itest=574: oxxxxx#31 nsp=1 mass=400 + expwfs.push_back( { // --------- + 5.000000000000000e+02, 3.000000000000000e+02, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, 0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 2.828427124746190e+01, 0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + 0.000000000000000e+00, -0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + -1.414213562373095e+01, -0.000000000000000e+00, // itest=575: oxxxxx#31 nsp=1 mass=-400 + -0.000000000000000e+00, 0.000000000000000e+00 } ); // itest=575: oxxxxx#31 nsp=1 mass=-400 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h index 14d7a4d892..962ce93cfc 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timer.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ #ifndef MGONGPUTIMER_H #define MGONGPUTIMER_H 1 diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h index 60d8c51021..bea9ddc6b2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/timermap.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUTIMERMAP_H #define MGONGPUTIMERMAP_H 1 diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt b/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt index bb6d5ee85d..651d90af80 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/src/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2022) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Roiser (2022-2023) for the MG5aMC CUDACPP plugin. + file(GLOB_RECURSE HEADERS "*.h") add_library(mg5amc_common Parameters_sm.cc read_slha.cc ${HEADERS}) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h index 7574e7f445..7c488dc254 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/HelAmps_MSSM_SLHA2.h @@ -1,6 +1,14 @@ +// Copyright (C) 2010 The ALOHA Development team and Contributors. +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Sep 2010) for the MG5aMC backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// MadGraph5_aMC@NLO v. 3.5.3_lo_vect, 2023-12-23 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -14,12 +22,13 @@ #include "Parameters_MSSM_SLHA2.h" +#include //#include //#include //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -183,6 +192,10 @@ namespace mg5amcCpu const int ipar ) // input: particle# out of npar { mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -193,7 +206,17 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. 
) { +#ifndef MGONGPU_CPPSIMD const fptype_sv pp = fpmin( pvec0, fpsqrt( pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3 ) ); +#else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); +#endif + // In C++ ixxxxx, use a single ip/im numbering that is valid both for pp==0 and pp>0, which have two numbering schemes in Fortran ixxxxx: + // for pp==0, Fortran sqm(0:1) has indexes 0,1 as in C++; but for Fortran pp>0, omega(2) has indexes 1,2 and not 0,1 + // NB: this is only possible in ixxxx, but in oxxxxx two different numbering schemes must be used + const int ip = ( 1 + nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3+nh)/2 because omega(2) has indexes 1,2 + const int im = ( 1 - nh ) / 2; // NB: same as in Fortran pp==0, differs from Fortran pp>0, which is (3-nh)/2 because omega(2) has indexes 1,2 #ifndef MGONGPU_CPPSIMD if( pp == 0. ) { @@ -201,8 +224,6 @@ namespace mg5amcCpu fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0. }; // possibility of negative fermion masses //sqm[1] = ( fmass < 0. ? -abs( sqm[0] ) : abs( sqm[0] ) ); // AV: why abs here? sqm[1] = ( fmass < 0. ? -sqm[0] : sqm[0] ); // AV: removed an abs here - const int ip = ( 1 + nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ - const int im = ( 1 - nh ) / 2; // NB: Fortran sqm(0:1) also has indexes 0,1 as in C++ fi[2] = cxmake( ip * sqm[ip], 0 ); fi[3] = cxmake( im * nsf * sqm[ip], 0 ); fi[4] = cxmake( ip * nsf * sqm[im], 0 ); @@ -214,8 +235,6 @@ namespace mg5amcCpu fptype( 1 + nsf - ( 1 - nsf ) * nh ) * (fptype)0.5 }; fptype omega[2] = { fpsqrt( pvec0 + pp ), 0. 
}; omega[1] = fmass / omega[0]; - const int ip = ( 1 + nh ) / 2; // NB: Fortran is (3+nh)/2 because omega(2) has indexes 1,2 and not 0,1 - const int im = ( 1 - nh ) / 2; // NB: Fortran is (3-nh)/2 because omega(2) has indexes 1,2 and not 0,1 const fptype sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype pp3 = fpmax( pp + pvec3, 0. ); const cxtype chi[2] = { cxmake( fpsqrt( pp3 * (fptype)0.5 / pp ), 0. ), @@ -226,8 +245,6 @@ namespace mg5amcCpu fi[5] = sfomega[1] * chi[ip]; } #else - const int ip = ( 1 + nh ) / 2; - const int im = ( 1 - nh ) / 2; // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses (NB: SCALAR!) @@ -243,10 +260,13 @@ namespace mg5amcCpu omega[1] = fmass / omega[0]; const fptype_v sfomega[2] = { sf[0] * omega[ip], sf[1] * omega[im] }; const fptype_v pp3 = fpmax( pp + pvec3, 0 ); - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0 ), + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0 ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 cxternary( ( pp3 == 0. ), cxmake( -nh, 0 ), - cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) }; + cxmake( (fptype)nh * pvec1, pvec2 ) / fpsqrt( 2. 
* ppDENOM * pp3DENOM ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 const cxtype_v fiB_2 = sfomega[0] * chi[im]; const cxtype_v fiB_3 = sfomega[0] * chi[ip]; const cxtype_v fiB_4 = sfomega[1] * chi[im]; @@ -261,10 +281,23 @@ namespace mg5amcCpu } else { +#ifdef MGONGPU_CPPSIMD + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_sv sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: dummy sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + cxtype_sv chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), + cxternary( sqp0p3 == 0, + cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), + cxmake( (fptype)nh * pvec1, pvec2 ) / (const fptype_v)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 +#else const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), fptype_sv{ 0 }, fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); - const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), cxternary( ( sqp0p3 == 0. ), cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ), cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; + const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), + ( sqp0p3 == 0. ? cxmake( -(fptype)nhel * fpsqrt( 2. * pvec0 ), 0. ) : cxmake( (fptype)nh * pvec1, pvec2 ) / sqp0p3 ) }; +#endif if( nh == 1 ) { fi[2] = cxzero_sv(); @@ -415,6 +448,10 @@ namespace mg5amcCpu const int ipar ) // input: particle# out of npar { mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! 
(#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -427,11 +464,11 @@ namespace mg5amcCpu if( vmass != 0. ) { const int nsvahl = nsv * std::abs( hel ); + const fptype hel0 = 1. - std::abs( hel ); +#ifndef MGONGPU_CPPSIMD const fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); const fptype_sv pp = fpmin( pvec0, fpsqrt( pt2 + ( pvec3 * pvec3 ) ) ); const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); - const fptype hel0 = 1. - std::abs( hel ); -#ifndef MGONGPU_CPPSIMD if( pp == 0. ) { vc[2] = cxmake( 0., 0. ); @@ -459,19 +496,25 @@ namespace mg5amcCpu } } #else + volatile fptype_sv pt2 = ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ); + volatile fptype_sv p2 = pt2 + ( pvec3 * pvec3 ); // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); + const fptype_sv pt = fpmin( pp, fpsqrt( pt2 ) ); // Branch A: pp == 0. const cxtype vcA_2 = cxmake( 0, 0 ); const cxtype vcA_3 = cxmake( -hel * sqh, 0 ); const cxtype vcA_4 = cxmake( 0, nsvahl * sqh ); const cxtype vcA_5 = cxmake( hel0, 0 ); // Branch B: pp != 0. - const fptype_v emp = pvec0 / ( vmass * pp ); + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + const fptype_v emp = pvec0 / ( vmass * ppDENOM ); // hack: dummy[ieppV] is not used if pp[ieppV]==0 const cxtype_v vcB_2 = cxmake( hel0 * pp / vmass, 0 ); - const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / pp * sqh, 0 ); + const cxtype_v vcB_5 = cxmake( hel0 * pvec3 * emp + hel * pt / ppDENOM * sqh, 0 ); // hack: dummy[ieppV] is not used if pp[ieppV]==0 // Branch B1: pp != 0. and pt != 0. 
- const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; - const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / pt * sqh ); - const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / pt * sqh ); + volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. ); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0 + const fptype_v pzpt = pvec3 / ( ppDENOM * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pp[ieppV]==0 + const cxtype_v vcB1_3 = cxmake( hel0 * pvec1 * emp - pvec1 * pzpt, -(fptype)nsvahl * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcB1_4 = cxmake( hel0 * pvec2 * emp - pvec2 * pzpt, (fptype)nsvahl * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 // Branch B2: pp != 0. and pt == 0. const cxtype vcB2_3 = cxmake( -hel * sqh, 0. ); const cxtype_v vcB2_4 = cxmake( 0., (fptype)nsvahl * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here @@ -487,7 +530,12 @@ namespace mg5amcCpu else { const fptype_sv& pp = pvec0; // NB: rewrite the following as in Fortran, using pp instead of pvec0 +#ifndef MGONGPU_CPPSIMD const fptype_sv pt = fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) ); +#else + volatile fptype_sv pt2 = pvec1 * pvec1 + pvec2 * pvec2; // volatile fixes #736 + const fptype_sv pt = fpsqrt( pt2 ); +#endif vc[2] = cxzero_sv(); vc[5] = cxmake( hel * pt / pp * sqh, 0. ); #ifndef MGONGPU_CPPSIMD @@ -506,9 +554,10 @@ namespace mg5amcCpu } #else // Branch A: pt != 0. - const fptype_v pzpt = pvec3 / ( pp * pt ) * sqh * hel; - const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / pt * sqh ); - const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / pt * sqh ); + volatile fptype_v ptDENOM = fpternary( pt != 0, pt, 1. 
); // hack: ptDENOM[ieppV]=1 if pt[ieppV]==0 + const fptype_v pzpt = pvec3 / ( pp * ptDENOM ) * sqh * hel; // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcA_3 = cxmake( -pvec1 * pzpt, -(fptype)nsv * pvec2 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 + const cxtype_v vcA_4 = cxmake( -pvec2 * pzpt, (fptype)nsv * pvec1 / ptDENOM * sqh ); // hack: dummy[ieppV] is not used if pt[ieppV]==0 // Branch B: pt == 0. const cxtype vcB_3 = cxmake( -(fptype)hel * sqh, 0 ); const cxtype_v vcB_4 = cxmake( 0, (fptype)nsv * fpternary( ( pvec3 < 0 ), -sqh, sqh ) ); // AV: removed an abs here @@ -560,6 +609,10 @@ namespace mg5amcCpu const int ipar ) // input: particle# out of npar { mgDebug( 0, __FUNCTION__ ); + // NEW IMPLEMENTATION FIXING FLOATING POINT EXCEPTIONS IN SIMD CODE (#701) + // Variables xxxDENOM are a hack to avoid division-by-0 FPE while preserving speed (#701 and #727) + // Variables xxxDENOM are declared as 'volatile' to make sure they are not optimized away on clang! (#724) + // A few additional variables are declared as 'volatile' to avoid sqrt-of-negative-number FPEs (#736) const fptype_sv& pvec0 = M_ACCESS::kernelAccessIp4IparConst( momenta, 0, ipar ); const fptype_sv& pvec1 = M_ACCESS::kernelAccessIp4IparConst( momenta, 1, ipar ); const fptype_sv& pvec2 = M_ACCESS::kernelAccessIp4IparConst( momenta, 2, ipar ); @@ -570,8 +623,8 @@ namespace mg5amcCpu const int nh = nhel * nsf; if( fmass != 0. ) { - const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); #ifndef MGONGPU_CPPSIMD + const fptype_sv pp = fpmin( pvec0, fpsqrt( ( pvec1 * pvec1 ) + ( pvec2 * pvec2 ) + ( pvec3 * pvec3 ) ) ); if( pp == 0. ) { // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! 
@@ -604,6 +657,8 @@ namespace mg5amcCpu fo[5] = sfomeg[0] * chi[ip]; } #else + volatile fptype_sv p2 = pvec1 * pvec1 + pvec2 * pvec2 + pvec3 * pvec3; // volatile fixes #736 + const fptype_sv pp = fpmin( pvec0, fpsqrt( p2 ) ); // Branch A: pp == 0. // NB: Do not use "abs" for floats! It returns an integer with no build warning! Use std::abs! fptype sqm[2] = { fpsqrt( std::abs( fmass ) ), 0 }; // possibility of negative fermion masses @@ -623,10 +678,13 @@ namespace mg5amcCpu const int imB = ( 1 - nh ) / 2; const fptype_v sfomeg[2] = { sf[0] * omega[ipB], sf[1] * omega[imB] }; const fptype_v pp3 = fpmax( pp + pvec3, 0. ); - const cxtype_v chi[2] = { cxmake( fpsqrt( pp3 * 0.5 / pp ), 0. ), + volatile fptype_v ppDENOM = fpternary( pp != 0, pp, 1. ); // hack: ppDENOM[ieppV]=1 if pp[ieppV]==0 + volatile fptype_v pp3DENOM = fpternary( pp3 != 0, pp3, 1. ); // hack: pp3DENOM[ieppV]=1 if pp3[ieppV]==0 + volatile fptype_v chi0r2 = pp3 * 0.5 / ppDENOM; // volatile fixes #736 + const cxtype_v chi[2] = { cxmake( fpsqrt( chi0r2 ), 0. ), // hack: dummy[ieppV] is not used if pp[ieppV]==0 ( cxternary( ( pp3 == 0. ), cxmake( -nh, 0. ), - cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * pp * pp3 ) ) ) }; + cxmake( (fptype)nh * pvec1, -pvec2 ) / fpsqrt( 2. * ppDENOM * pp3DENOM ) ) ) }; // hack: dummy[ieppV] is not used if pp[ieppV]==0 const cxtype_v foB_2 = sfomeg[1] * chi[imB]; const cxtype_v foB_3 = sfomeg[1] * chi[ipB]; const cxtype_v foB_4 = sfomeg[0] * chi[imB]; @@ -641,13 +699,23 @@ namespace mg5amcCpu } else { +#ifdef MGONGPU_CPPSIMD + volatile fptype_sv p0p3 = fpmax( pvec0 + pvec3, 0 ); // volatile fixes #736 + volatile fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. and pvec2 == 0. and pvec3 < 0. ), + fptype_sv{ 0 }, + fpsqrt( p0p3 ) * (fptype)nsf ); + volatile fptype_v sqp0p3DENOM = fpternary( sqp0p3 != 0, (fptype_sv)sqp0p3, 1. ); // hack: sqp0p3DENOM[ieppV]=1 if sqp0p3[ieppV]==0 + const cxtype_v chi[2] = { cxmake( (fptype_v)sqp0p3, 0. ), + cxternary( ( sqp0p3 == 0. 
), + cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), + cxmake( (fptype)nh * pvec1, -pvec2 ) / (const fptype_sv)sqp0p3DENOM ) }; // hack: dummy[ieppV] is not used if sqp0p3[ieppV]==0 +#else const fptype_sv sqp0p3 = fpternary( ( pvec1 == 0. ) and ( pvec2 == 0. ) and ( pvec3 < 0. ), 0, fpsqrt( fpmax( pvec0 + pvec3, 0. ) ) * (fptype)nsf ); const cxtype_sv chi[2] = { cxmake( sqp0p3, 0. ), - cxternary( ( sqp0p3 == 0. ), - cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ), - cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; + ( sqp0p3 == 0. ? cxmake( -nhel, 0. ) * fpsqrt( 2. * pvec0 ) : cxmake( (fptype)nh * pvec1, -pvec2 ) / sqp0p3 ) }; +#endif if( nh == 1 ) { fo[2] = chi[0]; @@ -794,6 +862,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) ALWAYS_INLINE; @@ -807,6 +876,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) ALWAYS_INLINE; //-------------------------------------------------------------------------- @@ -817,6 +887,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) ALWAYS_INLINE; @@ -829,6 +900,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) ALWAYS_INLINE; @@ -841,6 +913,7 @@ namespace mg5amcCpu VVV1P0_1( const fptype allV2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allV1[] ) @@ -879,6 +952,7 @@ namespace mg5amcCpu const fptype allF2[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, fptype allvertexes[] ) { mgDebug( 0, __FUNCTION__ ); @@ -902,6 +976,7 @@ namespace mg5amcCpu FFV1_1( const fptype allF2[], const fptype allV3[], 
const fptype allCOUP[], + const double Ccoeff, const fptype M1, const fptype W1, fptype allF1[] ) @@ -933,6 +1008,7 @@ namespace mg5amcCpu FFV1_2( const fptype allF1[], const fptype allV3[], const fptype allCOUP[], + const double Ccoeff, const fptype M2, const fptype W2, fptype allF2[] ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index bed37ead65..88e937627f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -1,6 +1,13 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// MadGraph5_aMC@NLO v. 
3.5.3_lo_vect, 2023-12-23 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -10,6 +17,12 @@ #include #include +#ifdef MGONGPUCPP_GPUIMPL +using namespace mg5amcGpu; +#else +using namespace mg5amcCpu; +#endif + #ifndef MGONGPU_HARDCODE_PARAM // Initialize static instance diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 605dd124f9..8f3d4c4241 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -1,6 +1,13 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Oct 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by -// MadGraph5_aMC@NLO v. 3.5.0_lo_vect, 2023-01-26 +// MadGraph5_aMC@NLO v. 
3.5.3_lo_vect, 2023-12-23 // By the MadGraph5_aMC@NLO Development Team // Visit launchpad.net/madgraph5 and amcatnlo.web.cern.ch //========================================================================== @@ -20,847 +27,863 @@ #include "read_slha.h" -class Parameters_MSSM_SLHA2 +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { -public: + class Parameters_MSSM_SLHA2 + { + public: + + static Parameters_MSSM_SLHA2* getInstance(); - static Parameters_MSSM_SLHA2* getInstance(); + // Define "zero" + double zero, ZERO; - // Define "zero" - double zero, ZERO; + // Model parameters independent of aS + //double aS; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + double mdl_Wsl6, mdl_Wsl5, mdl_Wsl4, mdl_Wsu6, mdl_Wsd6, mdl_Wsu5, mdl_Wsd5, mdl_Wsu4, mdl_Wsd4, mdl_Wch2, mdl_Wneu4, mdl_Wneu3, mdl_Wch1, mdl_Wneu2, mdl_Wgo, mdl_Wsn3, mdl_Wsl3, mdl_Wsn2, mdl_Wsl2, mdl_Wsn1, mdl_Wsl1, mdl_Wsu3, mdl_Wsd3, mdl_Wsu2, mdl_Wsd2, mdl_Wsu1, mdl_Wsd1, mdl_WH, mdl_WA0, mdl_WH02, mdl_WH01, mdl_WW, mdl_WZ, mdl_WT, mdl_Ryu3x3, mdl_Rye3x3, mdl_Ryd3x3, mdl_RVV2x2, mdl_RVV2x1, mdl_RVV1x2, mdl_RVV1x1, mdl_RCKM3x3, mdl_RCKM2x2, mdl_RCKM1x1, mdl_RRu6x6, mdl_RRu6x3, mdl_RRu5x5, mdl_RRu4x4, mdl_RRu3x6, mdl_RRu3x3, mdl_RRu2x2, mdl_RRu1x1, mdl_RMNS3x3, mdl_RMNS2x2, mdl_RMNS1x1, mdl_RUU2x2, mdl_RUU2x1, mdl_RUU1x2, mdl_RUU1x1, mdl_Rtu3x3, mdl_Rte3x3, mdl_Rtd3x3, mdl_RRn3x3, mdl_RRn2x2, mdl_RRn1x1, aEWM1, mdl_RRl6x6, mdl_RRl6x3, mdl_RRl5x5, mdl_RRl4x4, mdl_RRl3x6, mdl_RRl3x3, mdl_RRl2x2, mdl_RRl1x1, mdl_RNN4x4, mdl_RNN4x3, mdl_RNN4x2, mdl_RNN4x1, mdl_RNN3x4, mdl_RNN3x3, mdl_RNN3x2, mdl_RNN3x1, mdl_RNN2x4, mdl_RNN2x3, mdl_RNN2x2, mdl_RNN2x1, mdl_RNN1x4, mdl_RNN1x3, mdl_RNN1x2, mdl_RNN1x1, mdl_RmU23x3, mdl_RmU21x1, mdl_RmQ23x3, mdl_RmQ21x1, mdl_mHu2, mdl_mHd2, mdl_RMx3, mdl_RMx2, mdl_RMx1, mdl_RmL23x3, 
mdl_RmL21x1, mdl_RmE23x3, mdl_RmE21x1, mdl_RmD23x3, mdl_RmD21x1, mdl_Msl6, mdl_Msl4, mdl_Msu6, mdl_Msd6, mdl_Msu4, mdl_Msd4, mdl_Mch2, mdl_Mneu4, mdl_Mneu3, mdl_Mch1, mdl_Mneu2, mdl_Mneu1, mdl_Mgo, mdl_Msn3, mdl_Msl3, mdl_Msn1, mdl_Msl1, mdl_Msu3, mdl_Msd3, mdl_Msu1, mdl_Msd1, mdl_MH, mdl_MA0, mdl_MH02, mdl_MH01, mdl_MW, mdl_MZ, mdl_Mta, mdl_MT, mdl_MB, mdl_MA2, mdl_tb, mdl_RMUH, mdl_alp, mdl_RRd6x6, mdl_RRd6x3, mdl_RRd5x5, mdl_RRd4x4, mdl_RRd3x6, mdl_RRd3x3, mdl_RRd2x2, mdl_RRd1x1, mdl_Msd5, mdl_Msd2, mdl_Msu5, mdl_Msu2, mdl_Msl5, mdl_Msl2, mdl_Msn2, mdl_RmU22x2, mdl_RmQ22x2, mdl_RmL22x2, mdl_RmE22x2, mdl_RmD22x2, mdl_conjg__Rn3x3, mdl_conjg__CKM3x3, mdl_Ru4x4, mdl_Ru1x1, mdl_Rn3x3, mdl_Rn1x1, mdl_Rl4x4, mdl_Rl1x1, mdl_Rd4x4, mdl_Rd1x1, mdl_I98x11, mdl_I97x11, mdl_I96x11, mdl_I93x11, mdl_I92x11, mdl_I87x11, mdl_I82x11, mdl_I74x11, mdl_I6x44, mdl_I5x11, mdl_I53x11, mdl_I52x44, mdl_I51x11, mdl_I39x11, mdl_I31x11, mdl_I26x44, mdl_I25x11, mdl_I12x11, mdl_I102x44, mdl_I101x44, mdl_I100x44, mdl_CKM3x3, mdl_atan__tb, mdl_beta, mdl_cw, mdl_MZ__exp__2, mdl_cw__exp__2, mdl_sw, mdl_cos__beta, mdl_sin__beta, mdl_sqrt__2, mdl_sw__exp__2, mdl_cos__alp, mdl_sin__alp, mdl_ee, mdl_gp, mdl_gw, mdl_vev, mdl_vd, mdl_vu, mdl_ee__exp__2; + cxsmpl mdl_mD21x1, mdl_mD22x2, mdl_mD23x3, mdl_mE21x1, mdl_mE22x2, mdl_mE23x3, mdl_mL21x1, mdl_mL22x2, mdl_mL23x3, mdl_mQ21x1, mdl_mQ22x2, mdl_mQ23x3, mdl_mU21x1, mdl_mU22x2, mdl_mU23x3, mdl_MUH, mdl_Mx1, mdl_Mx2, mdl_Mx3, mdl_NN1x1, mdl_NN1x2, mdl_NN1x3, mdl_NN1x4, mdl_NN2x1, mdl_NN2x2, mdl_NN2x3, mdl_NN2x4, mdl_NN3x1, mdl_NN3x2, mdl_NN3x3, mdl_NN3x4, mdl_NN4x1, mdl_NN4x2, mdl_NN4x3, mdl_NN4x4, mdl_Rd3x3, mdl_Rd3x6, mdl_Rd6x3, mdl_Rd6x6, mdl_Rl3x3, mdl_Rl3x6, mdl_Rl6x3, mdl_Rl6x6, mdl_Ru3x3, mdl_Ru3x6, mdl_Ru6x3, mdl_Ru6x6, mdl_UU1x1, mdl_UU1x2, mdl_UU2x1, mdl_UU2x2, mdl_VV1x1, mdl_VV1x2, mdl_VV2x1, mdl_VV2x2, mdl_td3x3, mdl_te3x3, mdl_tu3x3, mdl_yd3x3, mdl_ye3x3, mdl_yu3x3, mdl_bb, mdl_conjg__yu3x3, mdl_I1x33, mdl_conjg__yd3x3, mdl_I10x33, 
mdl_I10x36, mdl_conjg__Rd3x6, mdl_I100x33, mdl_I100x36, mdl_conjg__Rd6x6, mdl_I100x63, mdl_I100x66, mdl_conjg__Rl3x6, mdl_I101x33, mdl_I101x36, mdl_conjg__Rl6x6, mdl_I101x63, mdl_I101x66, mdl_conjg__Ru3x6, mdl_I102x33, mdl_I102x36, mdl_conjg__Ru6x6, mdl_I102x63, mdl_I102x66, mdl_I11x33, mdl_I11x36, mdl_conjg__Rd3x3, mdl_I12x33, mdl_I12x36, mdl_conjg__Rd6x3, mdl_I12x63, mdl_I12x66, mdl_I13x33, mdl_I13x36, mdl_I13x63, mdl_I13x66, mdl_conjg__td3x3, mdl_I14x33, mdl_I14x36, mdl_I14x63, mdl_I14x66, mdl_I15x33, mdl_I15x36, mdl_I15x63, mdl_I15x66, mdl_I16x33, mdl_I16x36, mdl_I16x63, mdl_I16x66, mdl_I17x33, mdl_I17x36, mdl_I17x63, mdl_I17x66, mdl_I18x33, mdl_I18x36, mdl_I18x63, mdl_I18x66, mdl_I19x33, mdl_I19x36, mdl_I19x63, mdl_I19x66, mdl_I2x33, mdl_I20x33, mdl_I21x33, mdl_conjg__ye3x3, mdl_I22x33, mdl_I23x33, mdl_I23x36, mdl_conjg__Rl3x3, mdl_I24x33, mdl_conjg__Rl6x3, mdl_I24x36, mdl_I25x33, mdl_I25x36, mdl_I25x63, mdl_I25x66, mdl_I26x33, mdl_I26x36, mdl_I26x63, mdl_I26x66, mdl_I27x33, mdl_I27x36, mdl_I28x33, mdl_I28x36, mdl_I29x33, mdl_I29x36, mdl_I3x33, mdl_I3x36, mdl_I30x33, mdl_I30x36, mdl_I31x33, mdl_I31x36, mdl_I31x63, mdl_I31x66, mdl_I32x33, mdl_I32x36, mdl_I32x63, mdl_I32x66, mdl_conjg__te3x3, mdl_I33x33, mdl_I33x36, mdl_I33x63, mdl_I33x66, mdl_I34x33, mdl_I34x36, mdl_I34x63, mdl_I34x66, mdl_I35x33, mdl_I35x36, mdl_I35x63, mdl_I35x66, mdl_I36x33, mdl_I36x36, mdl_I36x63, mdl_I36x66, mdl_I37x33, mdl_I37x36, mdl_I37x63, mdl_I37x66, mdl_I38x33, mdl_I38x36, mdl_I38x63, mdl_I38x66, mdl_I39x33, mdl_I39x36, mdl_I4x33, mdl_I4x36, mdl_I40x33, mdl_I40x36, mdl_I41x33, mdl_I41x36, mdl_I42x33, mdl_I42x36, mdl_I44x33, mdl_I45x33, mdl_I45x36, mdl_I46x33, mdl_I46x36, mdl_I47x33, mdl_I47x36, mdl_I48x33, mdl_I48x36, mdl_I49x33, mdl_I49x36, mdl_I5x33, mdl_I5x36, mdl_I5x63, mdl_I5x66, mdl_conjg__Ru3x3, mdl_I50x33, mdl_conjg__Ru6x3, mdl_I50x36, mdl_I51x33, mdl_I51x36, mdl_I51x63, mdl_I51x66, mdl_I52x33, mdl_I52x36, mdl_I52x63, mdl_I52x66, mdl_I53x33, mdl_I53x36, mdl_I53x63, 
mdl_I53x66, mdl_conjg__tu3x3, mdl_I54x33, mdl_I54x36, mdl_I54x63, mdl_I54x66, mdl_I55x33, mdl_I55x36, mdl_I55x63, mdl_I55x66, mdl_I56x33, mdl_I56x36, mdl_I56x63, mdl_I56x66, mdl_I57x33, mdl_I57x36, mdl_I57x63, mdl_I57x66, mdl_I58x33, mdl_I58x36, mdl_I58x63, mdl_I58x66, mdl_I59x33, mdl_I59x36, mdl_I59x63, mdl_I59x66, mdl_I6x33, mdl_I6x36, mdl_I6x63, mdl_I6x66, mdl_I60x33, mdl_I60x36, mdl_I60x63, mdl_I60x66, mdl_I61x33, mdl_I61x36, mdl_I62x33, mdl_I62x36, mdl_I63x33, mdl_I63x36, mdl_I64x33, mdl_I64x36, mdl_I65x33, mdl_I65x36, mdl_I66x33, mdl_I66x36, mdl_I66x63, mdl_I66x66, mdl_I67x33, mdl_I67x36, mdl_I67x63, mdl_I67x66, mdl_I68x33, mdl_I68x36, mdl_I68x63, mdl_I68x66, mdl_I69x33, mdl_I69x36, mdl_I69x63, mdl_I69x66, mdl_I7x33, mdl_I7x36, mdl_I70x33, mdl_I70x36, mdl_I70x63, mdl_I70x66, mdl_I71x33, mdl_I71x36, mdl_I71x63, mdl_I71x66, mdl_I72x33, mdl_I72x36, mdl_I72x63, mdl_I72x66, mdl_I73x33, mdl_I73x36, mdl_I73x63, mdl_I73x66, mdl_I74x33, mdl_I74x36, mdl_I74x63, mdl_I74x66, mdl_I75x33, mdl_I75x36, mdl_I75x63, mdl_I75x66, mdl_I76x33, mdl_I76x36, mdl_I76x63, mdl_I76x66, mdl_I77x33, mdl_I77x36, mdl_I77x63, mdl_I77x66, mdl_I78x33, mdl_I78x36, mdl_I78x63, mdl_I78x66, mdl_I79x33, mdl_I79x36, mdl_I79x63, mdl_I79x66, mdl_I8x33, mdl_I8x36, mdl_I80x33, mdl_I80x36, mdl_I80x63, mdl_I80x66, mdl_I81x33, mdl_I81x36, mdl_I81x63, mdl_I81x66, mdl_I82x33, mdl_I82x36, mdl_I83x33, mdl_I83x36, mdl_I84x33, mdl_I84x36, mdl_I85x33, mdl_I85x36, mdl_I86x33, mdl_I86x36, mdl_I88x33, mdl_I89x33, mdl_I89x36, mdl_I9x33, mdl_I9x36, mdl_I90x33, mdl_I90x36, mdl_I91x33, mdl_I91x36, mdl_I92x33, mdl_I92x36, mdl_I92x63, mdl_I92x66, mdl_I93x33, mdl_I93x36, mdl_I94x33, mdl_I94x36, mdl_I94x63, mdl_I94x66, mdl_I95x33, mdl_I95x36, mdl_I96x33, mdl_I96x36, mdl_I96x63, mdl_I96x66, mdl_I97x33, mdl_I97x36, mdl_I97x63, mdl_I97x66, mdl_I98x33, mdl_I98x36, mdl_I98x63, mdl_I98x66, mdl_I99x33, mdl_complexi, mdl_conjg__NN1x1, mdl_conjg__NN1x2, mdl_conjg__NN1x3, mdl_conjg__NN1x4, mdl_conjg__NN2x1, mdl_conjg__NN2x2, 
mdl_conjg__NN2x3, mdl_conjg__NN2x4, mdl_conjg__NN3x1, mdl_conjg__NN3x2, mdl_conjg__NN3x3, mdl_conjg__NN3x4, mdl_conjg__NN4x1, mdl_conjg__NN4x2, mdl_conjg__NN4x3, mdl_conjg__NN4x4, mdl_conjg__UU1x1, mdl_conjg__UU1x2, mdl_conjg__UU2x1, mdl_conjg__UU2x2, mdl_conjg__VV1x1, mdl_conjg__VV1x2, mdl_conjg__VV2x1, mdl_conjg__VV2x2, mdl_conjg__MUH; - // Model parameters independent of aS - //double aS; // now retrieved event-by-event (as G) from Fortran (running alphas #373) - double mdl_Wsl6, mdl_Wsl5, mdl_Wsl4, mdl_Wsu6, mdl_Wsd6, mdl_Wsu5, mdl_Wsd5, mdl_Wsu4, mdl_Wsd4, mdl_Wch2, mdl_Wneu4, mdl_Wneu3, mdl_Wch1, mdl_Wneu2, mdl_Wgo, mdl_Wsn3, mdl_Wsl3, mdl_Wsn2, mdl_Wsl2, mdl_Wsn1, mdl_Wsl1, mdl_Wsu3, mdl_Wsd3, mdl_Wsu2, mdl_Wsd2, mdl_Wsu1, mdl_Wsd1, mdl_WH, mdl_WA0, mdl_WH02, mdl_WH01, mdl_WW, mdl_WZ, mdl_WT, mdl_Ryu3x3, mdl_Rye3x3, mdl_Ryd3x3, mdl_RVV2x2, mdl_RVV2x1, mdl_RVV1x2, mdl_RVV1x1, mdl_RCKM3x3, mdl_RCKM2x2, mdl_RCKM1x1, mdl_RRu6x6, mdl_RRu6x3, mdl_RRu5x5, mdl_RRu4x4, mdl_RRu3x6, mdl_RRu3x3, mdl_RRu2x2, mdl_RRu1x1, mdl_RMNS3x3, mdl_RMNS2x2, mdl_RMNS1x1, mdl_RUU2x2, mdl_RUU2x1, mdl_RUU1x2, mdl_RUU1x1, mdl_Rtu3x3, mdl_Rte3x3, mdl_Rtd3x3, mdl_RRn3x3, mdl_RRn2x2, mdl_RRn1x1, aEWM1, mdl_RRl6x6, mdl_RRl6x3, mdl_RRl5x5, mdl_RRl4x4, mdl_RRl3x6, mdl_RRl3x3, mdl_RRl2x2, mdl_RRl1x1, mdl_RNN4x4, mdl_RNN4x3, mdl_RNN4x2, mdl_RNN4x1, mdl_RNN3x4, mdl_RNN3x3, mdl_RNN3x2, mdl_RNN3x1, mdl_RNN2x4, mdl_RNN2x3, mdl_RNN2x2, mdl_RNN2x1, mdl_RNN1x4, mdl_RNN1x3, mdl_RNN1x2, mdl_RNN1x1, mdl_RmU23x3, mdl_RmU21x1, mdl_RmQ23x3, mdl_RmQ21x1, mdl_mHu2, mdl_mHd2, mdl_RMx3, mdl_RMx2, mdl_RMx1, mdl_RmL23x3, mdl_RmL21x1, mdl_RmE23x3, mdl_RmE21x1, mdl_RmD23x3, mdl_RmD21x1, mdl_Msl6, mdl_Msl4, mdl_Msu6, mdl_Msd6, mdl_Msu4, mdl_Msd4, mdl_Mch2, mdl_Mneu4, mdl_Mneu3, mdl_Mch1, mdl_Mneu2, mdl_Mneu1, mdl_Mgo, mdl_Msn3, mdl_Msl3, mdl_Msn1, mdl_Msl1, mdl_Msu3, mdl_Msd3, mdl_Msu1, mdl_Msd1, mdl_MH, mdl_MA0, mdl_MH02, mdl_MH01, mdl_MW, mdl_MZ, mdl_Mta, mdl_MT, mdl_MB, mdl_MA2, mdl_tb, mdl_RMUH, mdl_alp, 
mdl_RRd6x6, mdl_RRd6x3, mdl_RRd5x5, mdl_RRd4x4, mdl_RRd3x6, mdl_RRd3x3, mdl_RRd2x2, mdl_RRd1x1, mdl_Msd5, mdl_Msd2, mdl_Msu5, mdl_Msu2, mdl_Msl5, mdl_Msl2, mdl_Msn2, mdl_RmU22x2, mdl_RmQ22x2, mdl_RmL22x2, mdl_RmE22x2, mdl_RmD22x2, mdl_conjg__Rn3x3, mdl_conjg__CKM3x3, mdl_Ru4x4, mdl_Ru1x1, mdl_Rn3x3, mdl_Rn1x1, mdl_Rl4x4, mdl_Rl1x1, mdl_Rd4x4, mdl_Rd1x1, mdl_I98x11, mdl_I97x11, mdl_I96x11, mdl_I93x11, mdl_I92x11, mdl_I87x11, mdl_I82x11, mdl_I74x11, mdl_I6x44, mdl_I5x11, mdl_I53x11, mdl_I52x44, mdl_I51x11, mdl_I39x11, mdl_I31x11, mdl_I26x44, mdl_I25x11, mdl_I12x11, mdl_I102x44, mdl_I101x44, mdl_I100x44, mdl_CKM3x3, mdl_atan__tb, mdl_beta, mdl_cw, mdl_MZ__exp__2, mdl_cw__exp__2, mdl_sw, mdl_cos__beta, mdl_sin__beta, mdl_sqrt__2, mdl_sw__exp__2, mdl_cos__alp, mdl_sin__alp, mdl_ee, mdl_gp, mdl_gw, mdl_vev, mdl_vd, mdl_vu, mdl_ee__exp__2; - cxsmpl mdl_mD21x1, mdl_mD22x2, mdl_mD23x3, mdl_mE21x1, mdl_mE22x2, mdl_mE23x3, mdl_mL21x1, mdl_mL22x2, mdl_mL23x3, mdl_mQ21x1, mdl_mQ22x2, mdl_mQ23x3, mdl_mU21x1, mdl_mU22x2, mdl_mU23x3, mdl_MUH, mdl_Mx1, mdl_Mx2, mdl_Mx3, mdl_NN1x1, mdl_NN1x2, mdl_NN1x3, mdl_NN1x4, mdl_NN2x1, mdl_NN2x2, mdl_NN2x3, mdl_NN2x4, mdl_NN3x1, mdl_NN3x2, mdl_NN3x3, mdl_NN3x4, mdl_NN4x1, mdl_NN4x2, mdl_NN4x3, mdl_NN4x4, mdl_Rd3x3, mdl_Rd3x6, mdl_Rd6x3, mdl_Rd6x6, mdl_Rl3x3, mdl_Rl3x6, mdl_Rl6x3, mdl_Rl6x6, mdl_Ru3x3, mdl_Ru3x6, mdl_Ru6x3, mdl_Ru6x6, mdl_UU1x1, mdl_UU1x2, mdl_UU2x1, mdl_UU2x2, mdl_VV1x1, mdl_VV1x2, mdl_VV2x1, mdl_VV2x2, mdl_td3x3, mdl_te3x3, mdl_tu3x3, mdl_yd3x3, mdl_ye3x3, mdl_yu3x3, mdl_bb, mdl_conjg__yu3x3, mdl_I1x33, mdl_conjg__yd3x3, mdl_I10x33, mdl_I10x36, mdl_conjg__Rd3x6, mdl_I100x33, mdl_I100x36, mdl_conjg__Rd6x6, mdl_I100x63, mdl_I100x66, mdl_conjg__Rl3x6, mdl_I101x33, mdl_I101x36, mdl_conjg__Rl6x6, mdl_I101x63, mdl_I101x66, mdl_conjg__Ru3x6, mdl_I102x33, mdl_I102x36, mdl_conjg__Ru6x6, mdl_I102x63, mdl_I102x66, mdl_I11x33, mdl_I11x36, mdl_conjg__Rd3x3, mdl_I12x33, mdl_I12x36, mdl_conjg__Rd6x3, mdl_I12x63, mdl_I12x66, mdl_I13x33, 
mdl_I13x36, mdl_I13x63, mdl_I13x66, mdl_conjg__td3x3, mdl_I14x33, mdl_I14x36, mdl_I14x63, mdl_I14x66, mdl_I15x33, mdl_I15x36, mdl_I15x63, mdl_I15x66, mdl_I16x33, mdl_I16x36, mdl_I16x63, mdl_I16x66, mdl_I17x33, mdl_I17x36, mdl_I17x63, mdl_I17x66, mdl_I18x33, mdl_I18x36, mdl_I18x63, mdl_I18x66, mdl_I19x33, mdl_I19x36, mdl_I19x63, mdl_I19x66, mdl_I2x33, mdl_I20x33, mdl_I21x33, mdl_conjg__ye3x3, mdl_I22x33, mdl_I23x33, mdl_I23x36, mdl_conjg__Rl3x3, mdl_I24x33, mdl_conjg__Rl6x3, mdl_I24x36, mdl_I25x33, mdl_I25x36, mdl_I25x63, mdl_I25x66, mdl_I26x33, mdl_I26x36, mdl_I26x63, mdl_I26x66, mdl_I27x33, mdl_I27x36, mdl_I28x33, mdl_I28x36, mdl_I29x33, mdl_I29x36, mdl_I3x33, mdl_I3x36, mdl_I30x33, mdl_I30x36, mdl_I31x33, mdl_I31x36, mdl_I31x63, mdl_I31x66, mdl_I32x33, mdl_I32x36, mdl_I32x63, mdl_I32x66, mdl_conjg__te3x3, mdl_I33x33, mdl_I33x36, mdl_I33x63, mdl_I33x66, mdl_I34x33, mdl_I34x36, mdl_I34x63, mdl_I34x66, mdl_I35x33, mdl_I35x36, mdl_I35x63, mdl_I35x66, mdl_I36x33, mdl_I36x36, mdl_I36x63, mdl_I36x66, mdl_I37x33, mdl_I37x36, mdl_I37x63, mdl_I37x66, mdl_I38x33, mdl_I38x36, mdl_I38x63, mdl_I38x66, mdl_I39x33, mdl_I39x36, mdl_I4x33, mdl_I4x36, mdl_I40x33, mdl_I40x36, mdl_I41x33, mdl_I41x36, mdl_I42x33, mdl_I42x36, mdl_I44x33, mdl_I45x33, mdl_I45x36, mdl_I46x33, mdl_I46x36, mdl_I47x33, mdl_I47x36, mdl_I48x33, mdl_I48x36, mdl_I49x33, mdl_I49x36, mdl_I5x33, mdl_I5x36, mdl_I5x63, mdl_I5x66, mdl_conjg__Ru3x3, mdl_I50x33, mdl_conjg__Ru6x3, mdl_I50x36, mdl_I51x33, mdl_I51x36, mdl_I51x63, mdl_I51x66, mdl_I52x33, mdl_I52x36, mdl_I52x63, mdl_I52x66, mdl_I53x33, mdl_I53x36, mdl_I53x63, mdl_I53x66, mdl_conjg__tu3x3, mdl_I54x33, mdl_I54x36, mdl_I54x63, mdl_I54x66, mdl_I55x33, mdl_I55x36, mdl_I55x63, mdl_I55x66, mdl_I56x33, mdl_I56x36, mdl_I56x63, mdl_I56x66, mdl_I57x33, mdl_I57x36, mdl_I57x63, mdl_I57x66, mdl_I58x33, mdl_I58x36, mdl_I58x63, mdl_I58x66, mdl_I59x33, mdl_I59x36, mdl_I59x63, mdl_I59x66, mdl_I6x33, mdl_I6x36, mdl_I6x63, mdl_I6x66, mdl_I60x33, mdl_I60x36, mdl_I60x63, 
mdl_I60x66, mdl_I61x33, mdl_I61x36, mdl_I62x33, mdl_I62x36, mdl_I63x33, mdl_I63x36, mdl_I64x33, mdl_I64x36, mdl_I65x33, mdl_I65x36, mdl_I66x33, mdl_I66x36, mdl_I66x63, mdl_I66x66, mdl_I67x33, mdl_I67x36, mdl_I67x63, mdl_I67x66, mdl_I68x33, mdl_I68x36, mdl_I68x63, mdl_I68x66, mdl_I69x33, mdl_I69x36, mdl_I69x63, mdl_I69x66, mdl_I7x33, mdl_I7x36, mdl_I70x33, mdl_I70x36, mdl_I70x63, mdl_I70x66, mdl_I71x33, mdl_I71x36, mdl_I71x63, mdl_I71x66, mdl_I72x33, mdl_I72x36, mdl_I72x63, mdl_I72x66, mdl_I73x33, mdl_I73x36, mdl_I73x63, mdl_I73x66, mdl_I74x33, mdl_I74x36, mdl_I74x63, mdl_I74x66, mdl_I75x33, mdl_I75x36, mdl_I75x63, mdl_I75x66, mdl_I76x33, mdl_I76x36, mdl_I76x63, mdl_I76x66, mdl_I77x33, mdl_I77x36, mdl_I77x63, mdl_I77x66, mdl_I78x33, mdl_I78x36, mdl_I78x63, mdl_I78x66, mdl_I79x33, mdl_I79x36, mdl_I79x63, mdl_I79x66, mdl_I8x33, mdl_I8x36, mdl_I80x33, mdl_I80x36, mdl_I80x63, mdl_I80x66, mdl_I81x33, mdl_I81x36, mdl_I81x63, mdl_I81x66, mdl_I82x33, mdl_I82x36, mdl_I83x33, mdl_I83x36, mdl_I84x33, mdl_I84x36, mdl_I85x33, mdl_I85x36, mdl_I86x33, mdl_I86x36, mdl_I88x33, mdl_I89x33, mdl_I89x36, mdl_I9x33, mdl_I9x36, mdl_I90x33, mdl_I90x36, mdl_I91x33, mdl_I91x36, mdl_I92x33, mdl_I92x36, mdl_I92x63, mdl_I92x66, mdl_I93x33, mdl_I93x36, mdl_I94x33, mdl_I94x36, mdl_I94x63, mdl_I94x66, mdl_I95x33, mdl_I95x36, mdl_I96x33, mdl_I96x36, mdl_I96x63, mdl_I96x66, mdl_I97x33, mdl_I97x36, mdl_I97x63, mdl_I97x66, mdl_I98x33, mdl_I98x36, mdl_I98x63, mdl_I98x66, mdl_I99x33, mdl_complexi, mdl_conjg__NN1x1, mdl_conjg__NN1x2, mdl_conjg__NN1x3, mdl_conjg__NN1x4, mdl_conjg__NN2x1, mdl_conjg__NN2x2, mdl_conjg__NN2x3, mdl_conjg__NN2x4, mdl_conjg__NN3x1, mdl_conjg__NN3x2, mdl_conjg__NN3x3, mdl_conjg__NN3x4, mdl_conjg__NN4x1, mdl_conjg__NN4x2, mdl_conjg__NN4x3, mdl_conjg__NN4x4, mdl_conjg__UU1x1, mdl_conjg__UU1x2, mdl_conjg__UU2x1, mdl_conjg__UU2x2, mdl_conjg__VV1x1, mdl_conjg__VV1x2, mdl_conjg__VV2x1, mdl_conjg__VV2x2, mdl_conjg__MUH; + // Model couplings independent of aS + // (none) - // Model 
couplings independent of aS - // (none) + // Model parameters dependent on aS + //double mdl_sqrt__aS, G; // now computed event-by-event (running alphas #373) + //cxsmpl mdl_G__exp__2; // now computed event-by-event (running alphas #373) - // Model parameters dependent on aS - //double mdl_sqrt__aS, G; // now computed event-by-event (running alphas #373) - //cxsmpl mdl_G__exp__2; // now computed event-by-event (running alphas #373) + // Model couplings dependent on aS + //cxsmpl GC_6, GC_51; // now computed event-by-event (running alphas #373) - // Model couplings dependent on aS - //cxsmpl GC_6, GC_51; // now computed event-by-event (running alphas #373) + // Set parameters that are unchanged during the run + void setIndependentParameters( SLHAReader& slha ); - // Set parameters that are unchanged during the run - void setIndependentParameters( SLHAReader& slha ); + // Set couplings that are unchanged during the run + void setIndependentCouplings(); - // Set couplings that are unchanged during the run - void setIndependentCouplings(); + // Set parameters that are changed event by event + //void setDependentParameters(); // now computed event-by-event (running alphas #373) - // Set parameters that are changed event by event - //void setDependentParameters(); // now computed event-by-event (running alphas #373) + // Set couplings that are changed event by event + //void setDependentCouplings(); // now computed event-by-event (running alphas #373) - // Set couplings that are changed event by event - //void setDependentCouplings(); // now computed event-by-event (running alphas #373) + // Print parameters that are unchanged during the run + void printIndependentParameters(); - // Print parameters that are unchanged during the run - void printIndependentParameters(); + // Print couplings that are unchanged during the run + void printIndependentCouplings(); - // Print couplings that are unchanged during the run - void printIndependentCouplings(); + // Print parameters 
that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) - // Print parameters that are changed event by event - //void printDependentParameters(); // now computed event-by-event (running alphas #373) + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) - // Print couplings that are changed event by event - //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + private: -private: + static Parameters_MSSM_SLHA2* instance; + }; - static Parameters_MSSM_SLHA2* instance; -}; +} // end namespace mg5amcGpu/mg5amcCpu #else #include #include -// Hardcoded constexpr physics parameters -namespace Parameters_MSSM_SLHA2 // keep the same name rather than HardcodedParameters_MSSM_SLHA2 for simplicity +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) + // Hardcoded constexpr physics parameters + namespace Parameters_MSSM_SLHA2 // keep the same name rather than HardcodedParameters_MSSM_SLHA2 for simplicity { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) + { + return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); + } + double constexpr constexpr_sqrt( double x ) + { + return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( x, x, 0 ) + : std::numeric_limits::quiet_NaN(); + } - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( double d ) + { + const int i = static_cast( d ); + return d < i ? i - 1 : i; + } - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 
1 : base * constexpr_pow( base, iexp - 1 ); + // Constexpr implementation of pow + constexpr double constexpr_pow( double base, double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // Model parameters independent of aS + constexpr double zero = 0; + constexpr double ZERO = 0; + constexpr double mdl_Wsl6 = 2.699061e-01; + constexpr double mdl_Wsl5 = 2.161216e-01; + constexpr double mdl_Wsl4 = 2.161216e-01; + constexpr double mdl_Wsu6 = 7.373133e+00; + constexpr double mdl_Wsd6 = 8.015663e-01; + constexpr double mdl_Wsu5 = 1.152973e+00; + constexpr double mdl_Wsd5 = 2.858123e-01; + constexpr double mdl_Wsu4 = 1.152973e+00; + constexpr double mdl_Wsd4 = 2.858123e-01; + constexpr double mdl_Wch2 = 2.486895e+00; + constexpr double mdl_Wneu4 = 2.585851e+00; + constexpr double mdl_Wneu3 = 1.915985e+00; + constexpr double mdl_Wch1 = 1.704145e-02; + constexpr double mdl_Wneu2 = 2.077700e-02; + constexpr double mdl_Wgo = 5.506754e+00; + constexpr double mdl_Wsn3 = 1.475190e-01; + constexpr double mdl_Wsl3 = 1.483273e-01; + constexpr double mdl_Wsn2 = 1.498816e-01; + constexpr double mdl_Wsl2 = 2.136822e-01; + constexpr double mdl_Wsn1 = 1.498816e-01; + constexpr double mdl_Wsl1 = 2.136822e-01; + constexpr double mdl_Wsu3 = 2.021596e+00; + constexpr double mdl_Wsd3 = 3.736276e+00; + constexpr double mdl_Wsu2 = 5.477195e+00; + constexpr double mdl_Wsd2 = 5.312788e+00; + constexpr double mdl_Wsu1 = 5.477195e+00; + constexpr 
double mdl_Wsd1 = 5.312788e+00; + constexpr double mdl_WH = 5.469628e-01; + constexpr double mdl_WA0 = 6.321785e-01; + constexpr double mdl_WH02 = 5.748014e-01; + constexpr double mdl_WH01 = 1.986108e-03; + constexpr double mdl_WW = 2.002822e+00; + constexpr double mdl_WZ = 2.411433e+00; + constexpr double mdl_WT = 1.561950e+00; + constexpr double mdl_Ryu3x3 = 8.928445e-01; + constexpr double mdl_Rye3x3 = 1.008908e-01; + constexpr double mdl_Ryd3x3 = 1.388402e-01; + constexpr double mdl_RVV2x2 = 9.725578e-01; + constexpr double mdl_RVV2x1 = 2.326612e-01; + constexpr double mdl_RVV1x2 = -2.326612e-01; + constexpr double mdl_RVV1x1 = 9.725578e-01; + constexpr double mdl_RCKM3x3 = 1.000000e+00; + constexpr double mdl_RCKM2x2 = 1.000000e+00; + constexpr double mdl_RCKM1x1 = 1.000000e+00; + constexpr double mdl_RRu6x6 = -5.536450e-01; + constexpr double mdl_RRu6x3 = 8.327528e-01; + constexpr double mdl_RRu5x5 = 1.000000e+00; + constexpr double mdl_RRu4x4 = 1.000000e+00; + constexpr double mdl_RRu3x6 = 8.327528e-01; + constexpr double mdl_RRu3x3 = 5.536450e-01; + constexpr double mdl_RRu2x2 = 1.000000e+00; + constexpr double mdl_RRu1x1 = 1.000000e+00; + constexpr double mdl_RMNS3x3 = 1.000000e+00; + constexpr double mdl_RMNS2x2 = 1.000000e+00; + constexpr double mdl_RMNS1x1 = 1.000000e+00; + constexpr double mdl_RUU2x2 = 9.168349e-01; + constexpr double mdl_RUU2x1 = 3.992666e-01; + constexpr double mdl_RUU1x2 = -3.992666e-01; + constexpr double mdl_RUU1x1 = 9.168349e-01; + constexpr double mdl_Rtu3x3 = -4.447525e+02; + constexpr double mdl_Rte3x3 = -2.540197e+01; + constexpr double mdl_Rtd3x3 = -1.106937e+02; + constexpr double mdl_RRn3x3 = 1.000000e+00; + constexpr double mdl_RRn2x2 = 1.000000e+00; + constexpr double mdl_RRn1x1 = 1.000000e+00; + //constexpr double aS = 1.180000e-01; // now retrieved event-by-event (as G) from Fortran (running alphas #373) + constexpr double aEWM1 = 1.279340e+02; + constexpr double mdl_RRl6x6 = -2.824872e-01; + constexpr double 
mdl_RRl6x3 = 9.592711e-01; + constexpr double mdl_RRl5x5 = 1.000000e+00; + constexpr double mdl_RRl4x4 = 1.000000e+00; + constexpr double mdl_RRl3x6 = 9.592711e-01; + constexpr double mdl_RRl3x3 = 2.824872e-01; + constexpr double mdl_RRl2x2 = 1.000000e+00; + constexpr double mdl_RRl1x1 = 1.000000e+00; + constexpr double mdl_RNN4x4 = -6.843778e-01; + constexpr double mdl_RNN4x3 = 6.492260e-01; + constexpr double mdl_RNN4x2 = 3.107390e-01; + constexpr double mdl_RNN4x1 = -1.165071e-01; + constexpr double mdl_RNN3x4 = 7.102270e-01; + constexpr double mdl_RNN3x3 = 6.958775e-01; + constexpr double mdl_RNN3x2 = 8.770049e-02; + constexpr double mdl_RNN3x1 = -6.033880e-02; + constexpr double mdl_RNN2x4 = 1.561507e-01; + constexpr double mdl_RNN2x3 = -2.698467e-01; + constexpr double mdl_RNN2x2 = 9.449493e-01; + constexpr double mdl_RNN2x1 = 9.935054e-02; + constexpr double mdl_RNN1x4 = -5.311861e-02; + constexpr double mdl_RNN1x3 = 1.464340e-01; + constexpr double mdl_RNN1x2 = -5.311036e-02; + constexpr double mdl_RNN1x1 = 9.863644e-01; + constexpr double mdl_RmU23x3 = 1.791371e+05; + constexpr double mdl_RmU21x1 = 2.803821e+05; + constexpr double mdl_RmQ23x3 = 2.487654e+05; + constexpr double mdl_RmQ21x1 = 2.998367e+05; + constexpr double mdl_mHu2 = -1.288001e+05; + constexpr double mdl_mHd2 = 3.233749e+04; + constexpr double mdl_RMx3 = 5.882630e+02; + constexpr double mdl_RMx2 = 1.915042e+02; + constexpr double mdl_RMx1 = 1.013965e+02; + constexpr double mdl_RmL23x3 = 3.782868e+04; + constexpr double mdl_RmL21x1 = 3.815567e+04; + constexpr double mdl_RmE23x3 = 1.796764e+04; + constexpr double mdl_RmE21x1 = 1.863063e+04; + constexpr double mdl_RmD23x3 = 2.702620e+05; + constexpr double mdl_RmD21x1 = 2.736847e+05; + constexpr double mdl_Msl6 = 2.068678e+02; + constexpr double mdl_Msl4 = 1.441028e+02; + constexpr double mdl_Msu6 = 5.857858e+02; + constexpr double mdl_Msd6 = 5.437267e+02; + constexpr double mdl_Msu4 = 5.492593e+02; + constexpr double mdl_Msd4 = 5.452285e+02; 
+ constexpr double mdl_Mch2 = 3.799393e+02; + constexpr double mdl_Mneu4 = 3.817294e+02; + constexpr double mdl_Mneu3 = -3.637560e+02; + constexpr double mdl_Mch1 = 1.816965e+02; + constexpr double mdl_Mneu2 = 1.810882e+02; + constexpr double mdl_Mneu1 = 9.668807e+01; + constexpr double mdl_Mgo = 6.077137e+02; + constexpr double mdl_Msn3 = 1.847085e+02; + constexpr double mdl_Msl3 = 1.344909e+02; + constexpr double mdl_Msn1 = 1.852583e+02; + constexpr double mdl_Msl1 = 2.029157e+02; + constexpr double mdl_Msu3 = 3.996685e+02; + constexpr double mdl_Msd3 = 5.130652e+02; + constexpr double mdl_Msu1 = 5.611190e+02; + constexpr double mdl_Msd1 = 5.684411e+02; + constexpr double mdl_MH = 4.078790e+02; + constexpr double mdl_MA0 = 3.995839e+02; + constexpr double mdl_MH02 = 3.999601e+02; + constexpr double mdl_MH01 = 1.108991e+02; + constexpr double mdl_MW = 7.982901e+01; + constexpr double mdl_MZ = 9.118760e+01; + constexpr double mdl_Mta = 1.777000e+00; + constexpr double mdl_MT = 1.750000e+02; + constexpr double mdl_MB = 4.889917e+00; + constexpr double mdl_MA2 = 1.664391e+05; + constexpr double mdl_tb = 9.748624e+00; + constexpr double mdl_RMUH = 3.576810e+02; + constexpr double mdl_alp = -1.138252e-01; + constexpr double mdl_RRd6x6 = 9.387379e-01; + constexpr double mdl_RRd6x3 = -3.446319e-01; + constexpr double mdl_RRd5x5 = 1.000000e+00; + constexpr double mdl_RRd4x4 = 1.000000e+00; + constexpr double mdl_RRd3x6 = 3.446319e-01; + constexpr double mdl_RRd3x3 = 9.387379e-01; + constexpr double mdl_RRd2x2 = 1.000000e+00; + constexpr double mdl_RRd1x1 = 1.000000e+00; + constexpr double mdl_Msd5 = 1. * mdl_Msd4; + constexpr double mdl_Msd2 = 1. * mdl_Msd1; + constexpr double mdl_Msu5 = 1. * mdl_Msu4; + constexpr double mdl_Msu2 = 1. * mdl_Msu1; + constexpr double mdl_Msl5 = 1. * mdl_Msl4; + constexpr double mdl_Msl2 = 1. * mdl_Msl1; + constexpr double mdl_Msn2 = 1. * mdl_Msn1; + constexpr double mdl_RmU22x2 = 1. * mdl_RmU21x1; + constexpr double mdl_RmQ22x2 = 1. 
* mdl_RmQ21x1; + constexpr double mdl_RmL22x2 = 1. * mdl_RmL21x1; + constexpr double mdl_RmE22x2 = 1. * mdl_RmE21x1; + constexpr double mdl_RmD22x2 = 1. * mdl_RmD21x1; + constexpr double mdl_conjg__Rn3x3 = 1.; + constexpr double mdl_conjg__CKM3x3 = 1.; + constexpr double mdl_Ru4x4 = 1.; + constexpr double mdl_Ru1x1 = 1.; + constexpr double mdl_Rn3x3 = 1.; + constexpr double mdl_Rn1x1 = 1.; + constexpr double mdl_Rl4x4 = 1.; + constexpr double mdl_Rl1x1 = 1.; + constexpr double mdl_Rd4x4 = 1.; + constexpr double mdl_Rd1x1 = 1.; + constexpr double mdl_I98x11 = 1.; + constexpr double mdl_I97x11 = 1.; + constexpr double mdl_I96x11 = 1.; + constexpr double mdl_I93x11 = 1.; + constexpr double mdl_I92x11 = 1.; + constexpr double mdl_I87x11 = 1.; + constexpr double mdl_I82x11 = 1.; + constexpr double mdl_I74x11 = 1.; + constexpr double mdl_I6x44 = 1.; + constexpr double mdl_I5x11 = 1.; + constexpr double mdl_I53x11 = 1.; + constexpr double mdl_I52x44 = 1.; + constexpr double mdl_I51x11 = 1.; + constexpr double mdl_I39x11 = 1.; + constexpr double mdl_I31x11 = 1.; + constexpr double mdl_I26x44 = 1.; + constexpr double mdl_I25x11 = 1.; + constexpr double mdl_I12x11 = 1.; + constexpr double mdl_I102x44 = 1.; + constexpr double mdl_I101x44 = 1.; + constexpr double mdl_I100x44 = 1.; + constexpr double mdl_CKM3x3 = 1.; + constexpr double mdl_atan__tb = atan( mdl_tb ); + constexpr double mdl_beta = mdl_atan__tb; + constexpr double mdl_cw = mdl_MW / mdl_MZ; + constexpr cxsmpl mdl_mD21x1 = mdl_RmD21x1; + constexpr cxsmpl mdl_mD22x2 = mdl_RmD22x2; + constexpr cxsmpl mdl_mD23x3 = mdl_RmD23x3; + constexpr cxsmpl mdl_mE21x1 = mdl_RmE21x1; + constexpr cxsmpl mdl_mE22x2 = mdl_RmE22x2; + constexpr cxsmpl mdl_mE23x3 = mdl_RmE23x3; + constexpr cxsmpl mdl_mL21x1 = mdl_RmL21x1; + constexpr cxsmpl mdl_mL22x2 = mdl_RmL22x2; + constexpr cxsmpl mdl_mL23x3 = mdl_RmL23x3; + constexpr cxsmpl mdl_mQ21x1 = mdl_RmQ21x1; + constexpr cxsmpl mdl_mQ22x2 = mdl_RmQ22x2; + constexpr cxsmpl mdl_mQ23x3 = 
mdl_RmQ23x3; + constexpr cxsmpl mdl_mU21x1 = mdl_RmU21x1; + constexpr cxsmpl mdl_mU22x2 = mdl_RmU22x2; + constexpr cxsmpl mdl_mU23x3 = mdl_RmU23x3; + constexpr cxsmpl mdl_MUH = mdl_RMUH; + constexpr cxsmpl mdl_Mx1 = mdl_RMx1; + constexpr cxsmpl mdl_Mx2 = mdl_RMx2; + constexpr cxsmpl mdl_Mx3 = mdl_RMx3; + constexpr cxsmpl mdl_NN1x1 = mdl_RNN1x1; + constexpr cxsmpl mdl_NN1x2 = mdl_RNN1x2; + constexpr cxsmpl mdl_NN1x3 = mdl_RNN1x3; + constexpr cxsmpl mdl_NN1x4 = mdl_RNN1x4; + constexpr cxsmpl mdl_NN2x1 = mdl_RNN2x1; + constexpr cxsmpl mdl_NN2x2 = mdl_RNN2x2; + constexpr cxsmpl mdl_NN2x3 = mdl_RNN2x3; + constexpr cxsmpl mdl_NN2x4 = mdl_RNN2x4; + constexpr cxsmpl mdl_NN3x1 = mdl_RNN3x1; + constexpr cxsmpl mdl_NN3x2 = mdl_RNN3x2; + constexpr cxsmpl mdl_NN3x3 = mdl_RNN3x3; + constexpr cxsmpl mdl_NN3x4 = mdl_RNN3x4; + constexpr cxsmpl mdl_NN4x1 = mdl_RNN4x1; + constexpr cxsmpl mdl_NN4x2 = mdl_RNN4x2; + constexpr cxsmpl mdl_NN4x3 = mdl_RNN4x3; + constexpr cxsmpl mdl_NN4x4 = mdl_RNN4x4; + constexpr cxsmpl mdl_Rd3x3 = mdl_RRd3x3; + constexpr cxsmpl mdl_Rd3x6 = mdl_RRd3x6; + constexpr cxsmpl mdl_Rd6x3 = mdl_RRd6x3; + constexpr cxsmpl mdl_Rd6x6 = mdl_RRd6x6; + constexpr cxsmpl mdl_Rl3x3 = mdl_RRl3x3; + constexpr cxsmpl mdl_Rl3x6 = mdl_RRl3x6; + constexpr cxsmpl mdl_Rl6x3 = mdl_RRl6x3; + constexpr cxsmpl mdl_Rl6x6 = mdl_RRl6x6; + constexpr cxsmpl mdl_Ru3x3 = mdl_RRu3x3; + constexpr cxsmpl mdl_Ru3x6 = mdl_RRu3x6; + constexpr cxsmpl mdl_Ru6x3 = mdl_RRu6x3; + constexpr cxsmpl mdl_Ru6x6 = mdl_RRu6x6; + constexpr cxsmpl mdl_UU1x1 = mdl_RUU1x1; + constexpr cxsmpl mdl_UU1x2 = mdl_RUU1x2; + constexpr cxsmpl mdl_UU2x1 = mdl_RUU2x1; + constexpr cxsmpl mdl_UU2x2 = mdl_RUU2x2; + constexpr cxsmpl mdl_VV1x1 = mdl_RVV1x1; + constexpr cxsmpl mdl_VV1x2 = mdl_RVV1x2; + constexpr cxsmpl mdl_VV2x1 = mdl_RVV2x1; + constexpr cxsmpl mdl_VV2x2 = mdl_RVV2x2; + constexpr cxsmpl mdl_td3x3 = mdl_Rtd3x3; + constexpr cxsmpl mdl_te3x3 = mdl_Rte3x3; + constexpr cxsmpl mdl_tu3x3 = mdl_Rtu3x3; + constexpr cxsmpl 
mdl_yd3x3 = mdl_Ryd3x3; + constexpr cxsmpl mdl_ye3x3 = mdl_Rye3x3; + constexpr cxsmpl mdl_yu3x3 = mdl_Ryu3x3; + constexpr double mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); + constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * cos( 2. * mdl_beta ) ) * tan( 2. * mdl_beta ) ) / 2.; + constexpr double mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); + constexpr double mdl_sw = constexpr_sqrt( 1. - mdl_cw__exp__2 ); + constexpr double mdl_cos__beta = cos( mdl_beta ); + constexpr double mdl_sin__beta = sin( mdl_beta ); + constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); + constexpr cxsmpl mdl_I1x33 = mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_conjg__yd3x3 = conj( mdl_yd3x3 ); + constexpr cxsmpl mdl_I10x33 = mdl_Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I10x36 = mdl_Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_conjg__Rd3x6 = conj( mdl_Rd3x6 ); + constexpr cxsmpl mdl_I100x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I100x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_conjg__Rd6x6 = conj( mdl_Rd6x6 ); + constexpr cxsmpl mdl_I100x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I100x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_conjg__Rl3x6 = conj( mdl_Rl3x6 ); + constexpr cxsmpl mdl_I101x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I101x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_conjg__Rl6x6 = conj( mdl_Rl6x6 ); + constexpr cxsmpl mdl_I101x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I101x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_conjg__Ru3x6 = conj( mdl_Ru3x6 ); + constexpr cxsmpl mdl_I102x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I102x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_conjg__Ru6x6 = conj( mdl_Ru6x6 ); + constexpr cxsmpl mdl_I102x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I102x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I11x33 = mdl_Rd3x6 * mdl_yd3x3; + 
constexpr cxsmpl mdl_I11x36 = mdl_Rd6x6 * mdl_yd3x3; + constexpr cxsmpl mdl_conjg__Rd3x3 = conj( mdl_Rd3x3 ); + constexpr cxsmpl mdl_I12x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I12x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_conjg__Rd6x3 = conj( mdl_Rd6x3 ); + constexpr cxsmpl mdl_I12x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I12x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I13x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I13x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I13x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I13x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_conjg__td3x3 = conj( mdl_td3x3 ); + constexpr cxsmpl mdl_I14x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I14x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I15x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I15x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I16x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I16x36 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I16x63 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I16x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I17x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x36 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x63 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I17x66 = mdl_Rd6x3 * mdl_yd3x3 * 
mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I18x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I18x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I18x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I18x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I19x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I19x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I2x33 = mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I20x33 = mdl_CKM3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I21x33 = mdl_CKM3x3 * mdl_yu3x3; + constexpr cxsmpl mdl_conjg__ye3x3 = conj( mdl_ye3x3 ); + constexpr cxsmpl mdl_I22x33 = mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I23x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I23x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_conjg__Rl3x3 = conj( mdl_Rl3x3 ); + constexpr cxsmpl mdl_I24x33 = mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_conjg__Rl6x3 = conj( mdl_Rl6x3 ); + constexpr cxsmpl mdl_I24x36 = mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I25x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I25x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I25x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I25x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I26x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I26x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I26x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I26x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I27x33 = mdl_Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I27x36 = mdl_Rl6x3 * mdl_conjg__ye3x3; + 
constexpr cxsmpl mdl_I28x33 = mdl_Rl3x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I28x36 = mdl_Rl6x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I29x33 = mdl_Rl3x3; + constexpr cxsmpl mdl_I29x36 = mdl_Rl6x3; + constexpr cxsmpl mdl_I3x33 = mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I3x36 = mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I30x33 = mdl_Rl3x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I30x36 = mdl_Rl6x6 * mdl_ye3x3; + constexpr cxsmpl mdl_I31x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I31x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I31x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I31x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I32x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I32x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; + constexpr cxsmpl mdl_I32x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_I32x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; + constexpr cxsmpl mdl_conjg__te3x3 = conj( mdl_te3x3 ); + constexpr cxsmpl mdl_I33x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I33x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I34x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I34x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I35x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I35x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I35x63 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I35x66 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I36x33 = mdl_Rl3x3 * mdl_ye3x3 * 
mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x63 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I36x66 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I37x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I37x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I37x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I37x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I38x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I38x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I39x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I39x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I4x33 = mdl_yd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I4x36 = mdl_yd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I40x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I40x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I41x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I41x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I42x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I42x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I44x33 = mdl_Rn3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I45x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I45x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I46x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl 
mdl_I46x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I47x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I47x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; + constexpr cxsmpl mdl_I48x33 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I48x36 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I49x33 = mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I49x36 = mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I5x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I5x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I5x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I5x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_conjg__Ru3x3 = conj( mdl_Ru3x3 ); + constexpr cxsmpl mdl_I50x33 = mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_conjg__Ru6x3 = conj( mdl_Ru6x3 ); + constexpr cxsmpl mdl_I50x36 = mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I51x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I51x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I51x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I51x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I52x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I52x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I52x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I52x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I53x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I53x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I53x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I53x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_conjg__tu3x3 = conj( mdl_tu3x3 ); + constexpr cxsmpl mdl_I54x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * 
mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I54x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I55x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I55x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I56x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I56x36 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I56x63 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I56x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I57x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I57x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I57x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I57x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I58x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x36 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x63 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I58x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I59x33 = mdl_Rd3x6 * mdl_yd3x3 * 
mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I59x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I59x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I59x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I6x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I6x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; + constexpr cxsmpl mdl_I6x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I6x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; + constexpr cxsmpl mdl_I60x33 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x36 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x63 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I60x66 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I61x33 = mdl_Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I61x36 = mdl_Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I62x33 = mdl_Ru3x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I62x36 = mdl_Ru6x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I63x33 = mdl_CKM3x3 * mdl_Ru3x3; + constexpr cxsmpl mdl_I63x36 = mdl_CKM3x3 * mdl_Ru6x3; + constexpr cxsmpl mdl_I64x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I64x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I65x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I65x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3; + constexpr cxsmpl mdl_I66x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I66x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I66x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I66x66 = mdl_CKM3x3 * 
mdl_Ru6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I67x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I67x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I68x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I68x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; + constexpr cxsmpl mdl_I69x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I69x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I69x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I69x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I7x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I7x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I70x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I70x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I71x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I71x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I71x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + 
constexpr cxsmpl mdl_I71x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I72x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I72x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I72x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I72x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I73x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I73x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I74x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I74x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I74x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I74x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I75x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I75x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; + constexpr cxsmpl mdl_I75x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I75x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; + constexpr cxsmpl mdl_I76x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I76x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I77x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I77x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 
* mdl_conjg__tu3x3; + constexpr cxsmpl mdl_I78x33 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I78x36 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I78x63 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I78x66 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I79x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I79x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I79x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I79x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I8x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I8x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x33 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x36 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x63 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I80x66 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I81x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I82x33 = mdl_CKM3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I82x36 = mdl_CKM3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I83x33 = mdl_CKM3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I83x36 = mdl_CKM3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; + constexpr cxsmpl mdl_I84x33 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I84x36 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I85x33 = 
mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I85x36 = mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I86x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I86x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; + constexpr cxsmpl mdl_I88x33 = mdl_ye3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I89x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I89x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I9x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I9x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; + constexpr cxsmpl mdl_I90x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I90x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; + constexpr cxsmpl mdl_I91x33 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I91x36 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I92x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I92x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I92x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I92x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I93x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I93x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I94x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I94x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I94x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I94x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I95x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I95x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; + constexpr cxsmpl mdl_I96x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I96x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; + constexpr cxsmpl mdl_I96x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I96x66 = 
mdl_Rd6x3 * mdl_conjg__Rd6x3; + constexpr cxsmpl mdl_I97x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I97x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; + constexpr cxsmpl mdl_I97x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I97x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; + constexpr cxsmpl mdl_I98x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I98x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; + constexpr cxsmpl mdl_I98x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I98x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; + constexpr cxsmpl mdl_I99x33 = mdl_ye3x3; + constexpr cxsmpl mdl_complexi = cxsmpl( 0., 1. ); + constexpr double mdl_sqrt__2 = constexpr_sqrt( 2. ); + constexpr double mdl_sw__exp__2 = ( ( mdl_sw ) * ( mdl_sw ) ); + constexpr cxsmpl mdl_conjg__NN1x1 = conj( mdl_NN1x1 ); + constexpr cxsmpl mdl_conjg__NN1x2 = conj( mdl_NN1x2 ); + constexpr cxsmpl mdl_conjg__NN1x3 = conj( mdl_NN1x3 ); + constexpr cxsmpl mdl_conjg__NN1x4 = conj( mdl_NN1x4 ); + constexpr cxsmpl mdl_conjg__NN2x1 = conj( mdl_NN2x1 ); + constexpr cxsmpl mdl_conjg__NN2x2 = conj( mdl_NN2x2 ); + constexpr cxsmpl mdl_conjg__NN2x3 = conj( mdl_NN2x3 ); + constexpr cxsmpl mdl_conjg__NN2x4 = conj( mdl_NN2x4 ); + constexpr cxsmpl mdl_conjg__NN3x1 = conj( mdl_NN3x1 ); + constexpr cxsmpl mdl_conjg__NN3x2 = conj( mdl_NN3x2 ); + constexpr cxsmpl mdl_conjg__NN3x3 = conj( mdl_NN3x3 ); + constexpr cxsmpl mdl_conjg__NN3x4 = conj( mdl_NN3x4 ); + constexpr cxsmpl mdl_conjg__NN4x1 = conj( mdl_NN4x1 ); + constexpr cxsmpl mdl_conjg__NN4x2 = conj( mdl_NN4x2 ); + constexpr cxsmpl mdl_conjg__NN4x3 = conj( mdl_NN4x3 ); + constexpr cxsmpl mdl_conjg__NN4x4 = conj( mdl_NN4x4 ); + constexpr cxsmpl mdl_conjg__UU1x1 = conj( mdl_UU1x1 ); + constexpr cxsmpl mdl_conjg__UU1x2 = conj( mdl_UU1x2 ); + constexpr cxsmpl mdl_conjg__UU2x1 = conj( mdl_UU2x1 ); + constexpr cxsmpl mdl_conjg__UU2x2 = conj( mdl_UU2x2 ); + constexpr cxsmpl mdl_conjg__VV1x1 = conj( mdl_VV1x1 ); + constexpr cxsmpl mdl_conjg__VV1x2 = conj( mdl_VV1x2 ); + 
constexpr cxsmpl mdl_conjg__VV2x1 = conj( mdl_VV2x1 ); + constexpr cxsmpl mdl_conjg__VV2x2 = conj( mdl_VV2x2 ); + constexpr double mdl_cos__alp = cos( mdl_alp ); + constexpr double mdl_sin__alp = sin( mdl_alp ); + constexpr cxsmpl mdl_conjg__MUH = conj( mdl_MUH ); + constexpr double mdl_ee = 2. * constexpr_sqrt( 1. / aEWM1 ) * constexpr_sqrt( M_PI ); + constexpr double mdl_gp = mdl_ee / mdl_cw; + constexpr double mdl_gw = mdl_ee / mdl_sw; + constexpr double mdl_vev = ( 2. * mdl_cw * mdl_MZ * mdl_sw ) / mdl_ee; + constexpr double mdl_vd = mdl_vev * mdl_cos__beta; + constexpr double mdl_vu = mdl_vev * mdl_sin__beta; + constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + if( mdl_Mneu2 < 0 ) + mdl_Wneu2 = -abs( mdl_Wneu2 ); + if( mdl_Mneu3 < 0 ) + mdl_Wneu3 = -abs( mdl_Wneu3 ); + if( mdl_Mneu4 < 0 ) + mdl_Wneu4 = -abs( mdl_Wneu4 ); + if( mdl_Mgo < 0 ) + mdl_Wgo = -abs( mdl_Wgo ); + // Model couplings independent of aS + // (none) + + // Model parameters dependent on aS + //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) + + // Model couplings dependent on aS + //constexpr cxsmpl GC_6 = -G; // now computed event-by-event (running alphas #373) + //constexpr cxsmpl GC_51 = -( mdl_complexi * G * mdl_I51x11 ); // now computed event-by-event (running alphas #373) + + // Print parameters that are unchanged during the run + void printIndependentParameters(); + + // Print couplings that are unchanged during the run + void printIndependentCouplings(); + + // Print parameters that are changed event by event + //void printDependentParameters(); // now computed event-by-event (running alphas #373) + + // Print couplings that are changed event by event + //void printDependentCouplings(); // now computed event-by-event (running alphas #373) } - // Model parameters independent of aS - constexpr double zero = 0; - constexpr double ZERO = 0; - constexpr double mdl_Wsl6 = 2.699061e-01; - constexpr double mdl_Wsl5 = 2.161216e-01; - constexpr double mdl_Wsl4 = 2.161216e-01; - constexpr double mdl_Wsu6 = 7.373133e+00; - constexpr double mdl_Wsd6 = 8.015663e-01; - constexpr double mdl_Wsu5 = 1.152973e+00; - constexpr double mdl_Wsd5 = 2.858123e-01; - constexpr double mdl_Wsu4 = 1.152973e+00; - constexpr double mdl_Wsd4 = 2.858123e-01; - constexpr double mdl_Wch2 = 2.486895e+00; - constexpr double mdl_Wneu4_abs = 2.585851e+00; - constexpr double mdl_Wneu3_abs = 1.915985e+00; - constexpr double mdl_Wch1 = 1.704145e-02; - constexpr double mdl_Wneu2_abs = 2.077700e-02; - constexpr double mdl_Wgo_abs = 5.506754e+00; - constexpr double mdl_Wsn3 = 1.475190e-01; - constexpr double mdl_Wsl3 = 1.483273e-01; - constexpr double mdl_Wsn2 = 1.498816e-01; - constexpr double mdl_Wsl2 = 2.136822e-01; - constexpr double mdl_Wsn1 = 1.498816e-01; - constexpr double mdl_Wsl1 = 2.136822e-01; - constexpr double mdl_Wsu3 = 
2.021596e+00; - constexpr double mdl_Wsd3 = 3.736276e+00; - constexpr double mdl_Wsu2 = 5.477195e+00; - constexpr double mdl_Wsd2 = 5.312788e+00; - constexpr double mdl_Wsu1 = 5.477195e+00; - constexpr double mdl_Wsd1 = 5.312788e+00; - constexpr double mdl_WH = 5.469628e-01; - constexpr double mdl_WA0 = 6.321785e-01; - constexpr double mdl_WH02 = 5.748014e-01; - constexpr double mdl_WH01 = 1.986108e-03; - constexpr double mdl_WW = 2.002822e+00; - constexpr double mdl_WZ = 2.411433e+00; - constexpr double mdl_WT = 1.561950e+00; - constexpr double mdl_Ryu3x3 = 8.928445e-01; - constexpr double mdl_Rye3x3 = 1.008908e-01; - constexpr double mdl_Ryd3x3 = 1.388402e-01; - constexpr double mdl_RVV2x2 = 9.725578e-01; - constexpr double mdl_RVV2x1 = 2.326612e-01; - constexpr double mdl_RVV1x2 = -2.326612e-01; - constexpr double mdl_RVV1x1 = 9.725578e-01; - constexpr double mdl_RCKM3x3 = 1.000000e+00; - constexpr double mdl_RCKM2x2 = 1.000000e+00; - constexpr double mdl_RCKM1x1 = 1.000000e+00; - constexpr double mdl_RRu6x6 = -5.536450e-01; - constexpr double mdl_RRu6x3 = 8.327528e-01; - constexpr double mdl_RRu5x5 = 1.000000e+00; - constexpr double mdl_RRu4x4 = 1.000000e+00; - constexpr double mdl_RRu3x6 = 8.327528e-01; - constexpr double mdl_RRu3x3 = 5.536450e-01; - constexpr double mdl_RRu2x2 = 1.000000e+00; - constexpr double mdl_RRu1x1 = 1.000000e+00; - constexpr double mdl_RMNS3x3 = 1.000000e+00; - constexpr double mdl_RMNS2x2 = 1.000000e+00; - constexpr double mdl_RMNS1x1 = 1.000000e+00; - constexpr double mdl_RUU2x2 = 9.168349e-01; - constexpr double mdl_RUU2x1 = 3.992666e-01; - constexpr double mdl_RUU1x2 = -3.992666e-01; - constexpr double mdl_RUU1x1 = 9.168349e-01; - constexpr double mdl_Rtu3x3 = -4.447525e+02; - constexpr double mdl_Rte3x3 = -2.540197e+01; - constexpr double mdl_Rtd3x3 = -1.106937e+02; - constexpr double mdl_RRn3x3 = 1.000000e+00; - constexpr double mdl_RRn2x2 = 1.000000e+00; - constexpr double mdl_RRn1x1 = 1.000000e+00; - //constexpr double aS = 
1.180000e-01; // now retrieved event-by-event (as G) from Fortran (running alphas #373) - constexpr double aEWM1 = 1.279340e+02; - constexpr double mdl_RRl6x6 = -2.824872e-01; - constexpr double mdl_RRl6x3 = 9.592711e-01; - constexpr double mdl_RRl5x5 = 1.000000e+00; - constexpr double mdl_RRl4x4 = 1.000000e+00; - constexpr double mdl_RRl3x6 = 9.592711e-01; - constexpr double mdl_RRl3x3 = 2.824872e-01; - constexpr double mdl_RRl2x2 = 1.000000e+00; - constexpr double mdl_RRl1x1 = 1.000000e+00; - constexpr double mdl_RNN4x4 = -6.843778e-01; - constexpr double mdl_RNN4x3 = 6.492260e-01; - constexpr double mdl_RNN4x2 = 3.107390e-01; - constexpr double mdl_RNN4x1 = -1.165071e-01; - constexpr double mdl_RNN3x4 = 7.102270e-01; - constexpr double mdl_RNN3x3 = 6.958775e-01; - constexpr double mdl_RNN3x2 = 8.770049e-02; - constexpr double mdl_RNN3x1 = -6.033880e-02; - constexpr double mdl_RNN2x4 = 1.561507e-01; - constexpr double mdl_RNN2x3 = -2.698467e-01; - constexpr double mdl_RNN2x2 = 9.449493e-01; - constexpr double mdl_RNN2x1 = 9.935054e-02; - constexpr double mdl_RNN1x4 = -5.311861e-02; - constexpr double mdl_RNN1x3 = 1.464340e-01; - constexpr double mdl_RNN1x2 = -5.311036e-02; - constexpr double mdl_RNN1x1 = 9.863644e-01; - constexpr double mdl_RmU23x3 = 1.791371e+05; - constexpr double mdl_RmU21x1 = 2.803821e+05; - constexpr double mdl_RmQ23x3 = 2.487654e+05; - constexpr double mdl_RmQ21x1 = 2.998367e+05; - constexpr double mdl_mHu2 = -1.288001e+05; - constexpr double mdl_mHd2 = 3.233749e+04; - constexpr double mdl_RMx3 = 5.882630e+02; - constexpr double mdl_RMx2 = 1.915042e+02; - constexpr double mdl_RMx1 = 1.013965e+02; - constexpr double mdl_RmL23x3 = 3.782868e+04; - constexpr double mdl_RmL21x1 = 3.815567e+04; - constexpr double mdl_RmE23x3 = 1.796764e+04; - constexpr double mdl_RmE21x1 = 1.863063e+04; - constexpr double mdl_RmD23x3 = 2.702620e+05; - constexpr double mdl_RmD21x1 = 2.736847e+05; - constexpr double mdl_Msl6 = 2.068678e+02; - constexpr double 
mdl_Msl4 = 1.441028e+02; - constexpr double mdl_Msu6 = 5.857858e+02; - constexpr double mdl_Msd6 = 5.437267e+02; - constexpr double mdl_Msu4 = 5.492593e+02; - constexpr double mdl_Msd4 = 5.452285e+02; - constexpr double mdl_Mch2 = 3.799393e+02; - constexpr double mdl_Mneu4 = 3.817294e+02; - constexpr double mdl_Mneu3 = -3.637560e+02; - constexpr double mdl_Mch1 = 1.816965e+02; - constexpr double mdl_Mneu2 = 1.810882e+02; - constexpr double mdl_Mneu1 = 9.668807e+01; - constexpr double mdl_Mgo = 6.077137e+02; - constexpr double mdl_Msn3 = 1.847085e+02; - constexpr double mdl_Msl3 = 1.344909e+02; - constexpr double mdl_Msn1 = 1.852583e+02; - constexpr double mdl_Msl1 = 2.029157e+02; - constexpr double mdl_Msu3 = 3.996685e+02; - constexpr double mdl_Msd3 = 5.130652e+02; - constexpr double mdl_Msu1 = 5.611190e+02; - constexpr double mdl_Msd1 = 5.684411e+02; - constexpr double mdl_MH = 4.078790e+02; - constexpr double mdl_MA0 = 3.995839e+02; - constexpr double mdl_MH02 = 3.999601e+02; - constexpr double mdl_MH01 = 1.108991e+02; - constexpr double mdl_MW = 7.982901e+01; - constexpr double mdl_MZ = 9.118760e+01; - constexpr double mdl_Mta = 1.777000e+00; - constexpr double mdl_MT = 1.750000e+02; - constexpr double mdl_MB = 4.889917e+00; - constexpr double mdl_MA2 = 1.664391e+05; - constexpr double mdl_tb = 9.748624e+00; - constexpr double mdl_RMUH = 3.576810e+02; - constexpr double mdl_alp = -1.138252e-01; - constexpr double mdl_RRd6x6 = 9.387379e-01; - constexpr double mdl_RRd6x3 = -3.446319e-01; - constexpr double mdl_RRd5x5 = 1.000000e+00; - constexpr double mdl_RRd4x4 = 1.000000e+00; - constexpr double mdl_RRd3x6 = 3.446319e-01; - constexpr double mdl_RRd3x3 = 9.387379e-01; - constexpr double mdl_RRd2x2 = 1.000000e+00; - constexpr double mdl_RRd1x1 = 1.000000e+00; - constexpr double mdl_Msd5 = 1. * mdl_Msd4; - constexpr double mdl_Msd2 = 1. * mdl_Msd1; - constexpr double mdl_Msu5 = 1. * mdl_Msu4; - constexpr double mdl_Msu2 = 1. 
* mdl_Msu1; - constexpr double mdl_Msl5 = 1. * mdl_Msl4; - constexpr double mdl_Msl2 = 1. * mdl_Msl1; - constexpr double mdl_Msn2 = 1. * mdl_Msn1; - constexpr double mdl_RmU22x2 = 1. * mdl_RmU21x1; - constexpr double mdl_RmQ22x2 = 1. * mdl_RmQ21x1; - constexpr double mdl_RmL22x2 = 1. * mdl_RmL21x1; - constexpr double mdl_RmE22x2 = 1. * mdl_RmE21x1; - constexpr double mdl_RmD22x2 = 1. * mdl_RmD21x1; - constexpr double mdl_conjg__Rn3x3 = 1.; - constexpr double mdl_conjg__CKM3x3 = 1.; - constexpr double mdl_Ru4x4 = 1.; - constexpr double mdl_Ru1x1 = 1.; - constexpr double mdl_Rn3x3 = 1.; - constexpr double mdl_Rn1x1 = 1.; - constexpr double mdl_Rl4x4 = 1.; - constexpr double mdl_Rl1x1 = 1.; - constexpr double mdl_Rd4x4 = 1.; - constexpr double mdl_Rd1x1 = 1.; - constexpr double mdl_I98x11 = 1.; - constexpr double mdl_I97x11 = 1.; - constexpr double mdl_I96x11 = 1.; - constexpr double mdl_I93x11 = 1.; - constexpr double mdl_I92x11 = 1.; - constexpr double mdl_I87x11 = 1.; - constexpr double mdl_I82x11 = 1.; - constexpr double mdl_I74x11 = 1.; - constexpr double mdl_I6x44 = 1.; - constexpr double mdl_I5x11 = 1.; - constexpr double mdl_I53x11 = 1.; - constexpr double mdl_I52x44 = 1.; - constexpr double mdl_I51x11 = 1.; - constexpr double mdl_I39x11 = 1.; - constexpr double mdl_I31x11 = 1.; - constexpr double mdl_I26x44 = 1.; - constexpr double mdl_I25x11 = 1.; - constexpr double mdl_I12x11 = 1.; - constexpr double mdl_I102x44 = 1.; - constexpr double mdl_I101x44 = 1.; - constexpr double mdl_I100x44 = 1.; - constexpr double mdl_CKM3x3 = 1.; - constexpr double mdl_atan__tb = atan( mdl_tb ); - constexpr double mdl_beta = mdl_atan__tb; - constexpr double mdl_cw = mdl_MW / mdl_MZ; - constexpr cxsmpl mdl_mD21x1 = mdl_RmD21x1; - constexpr cxsmpl mdl_mD22x2 = mdl_RmD22x2; - constexpr cxsmpl mdl_mD23x3 = mdl_RmD23x3; - constexpr cxsmpl mdl_mE21x1 = mdl_RmE21x1; - constexpr cxsmpl mdl_mE22x2 = mdl_RmE22x2; - constexpr cxsmpl mdl_mE23x3 = mdl_RmE23x3; - constexpr cxsmpl mdl_mL21x1 
= mdl_RmL21x1; - constexpr cxsmpl mdl_mL22x2 = mdl_RmL22x2; - constexpr cxsmpl mdl_mL23x3 = mdl_RmL23x3; - constexpr cxsmpl mdl_mQ21x1 = mdl_RmQ21x1; - constexpr cxsmpl mdl_mQ22x2 = mdl_RmQ22x2; - constexpr cxsmpl mdl_mQ23x3 = mdl_RmQ23x3; - constexpr cxsmpl mdl_mU21x1 = mdl_RmU21x1; - constexpr cxsmpl mdl_mU22x2 = mdl_RmU22x2; - constexpr cxsmpl mdl_mU23x3 = mdl_RmU23x3; - constexpr cxsmpl mdl_MUH = mdl_RMUH; - constexpr cxsmpl mdl_Mx1 = mdl_RMx1; - constexpr cxsmpl mdl_Mx2 = mdl_RMx2; - constexpr cxsmpl mdl_Mx3 = mdl_RMx3; - constexpr cxsmpl mdl_NN1x1 = mdl_RNN1x1; - constexpr cxsmpl mdl_NN1x2 = mdl_RNN1x2; - constexpr cxsmpl mdl_NN1x3 = mdl_RNN1x3; - constexpr cxsmpl mdl_NN1x4 = mdl_RNN1x4; - constexpr cxsmpl mdl_NN2x1 = mdl_RNN2x1; - constexpr cxsmpl mdl_NN2x2 = mdl_RNN2x2; - constexpr cxsmpl mdl_NN2x3 = mdl_RNN2x3; - constexpr cxsmpl mdl_NN2x4 = mdl_RNN2x4; - constexpr cxsmpl mdl_NN3x1 = mdl_RNN3x1; - constexpr cxsmpl mdl_NN3x2 = mdl_RNN3x2; - constexpr cxsmpl mdl_NN3x3 = mdl_RNN3x3; - constexpr cxsmpl mdl_NN3x4 = mdl_RNN3x4; - constexpr cxsmpl mdl_NN4x1 = mdl_RNN4x1; - constexpr cxsmpl mdl_NN4x2 = mdl_RNN4x2; - constexpr cxsmpl mdl_NN4x3 = mdl_RNN4x3; - constexpr cxsmpl mdl_NN4x4 = mdl_RNN4x4; - constexpr cxsmpl mdl_Rd3x3 = mdl_RRd3x3; - constexpr cxsmpl mdl_Rd3x6 = mdl_RRd3x6; - constexpr cxsmpl mdl_Rd6x3 = mdl_RRd6x3; - constexpr cxsmpl mdl_Rd6x6 = mdl_RRd6x6; - constexpr cxsmpl mdl_Rl3x3 = mdl_RRl3x3; - constexpr cxsmpl mdl_Rl3x6 = mdl_RRl3x6; - constexpr cxsmpl mdl_Rl6x3 = mdl_RRl6x3; - constexpr cxsmpl mdl_Rl6x6 = mdl_RRl6x6; - constexpr cxsmpl mdl_Ru3x3 = mdl_RRu3x3; - constexpr cxsmpl mdl_Ru3x6 = mdl_RRu3x6; - constexpr cxsmpl mdl_Ru6x3 = mdl_RRu6x3; - constexpr cxsmpl mdl_Ru6x6 = mdl_RRu6x6; - constexpr cxsmpl mdl_UU1x1 = mdl_RUU1x1; - constexpr cxsmpl mdl_UU1x2 = mdl_RUU1x2; - constexpr cxsmpl mdl_UU2x1 = mdl_RUU2x1; - constexpr cxsmpl mdl_UU2x2 = mdl_RUU2x2; - constexpr cxsmpl mdl_VV1x1 = mdl_RVV1x1; - constexpr cxsmpl mdl_VV1x2 = mdl_RVV1x2; - 
constexpr cxsmpl mdl_VV2x1 = mdl_RVV2x1; - constexpr cxsmpl mdl_VV2x2 = mdl_RVV2x2; - constexpr cxsmpl mdl_td3x3 = mdl_Rtd3x3; - constexpr cxsmpl mdl_te3x3 = mdl_Rte3x3; - constexpr cxsmpl mdl_tu3x3 = mdl_Rtu3x3; - constexpr cxsmpl mdl_yd3x3 = mdl_Ryd3x3; - constexpr cxsmpl mdl_ye3x3 = mdl_Rye3x3; - constexpr cxsmpl mdl_yu3x3 = mdl_Ryu3x3; - constexpr double mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); - constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * cos( 2. * mdl_beta ) ) * tan( 2. * mdl_beta ) ) / 2.; - constexpr double mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); - constexpr double mdl_sw = constexpr_sqrt( 1. - mdl_cw__exp__2 ); - constexpr double mdl_cos__beta = cos( mdl_beta ); - constexpr double mdl_sin__beta = sin( mdl_beta ); - constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); - constexpr cxsmpl mdl_I1x33 = mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_conjg__yd3x3 = conj( mdl_yd3x3 ); - constexpr cxsmpl mdl_I10x33 = mdl_Rd3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I10x36 = mdl_Rd6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_conjg__Rd3x6 = conj( mdl_Rd3x6 ); - constexpr cxsmpl mdl_I100x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_I100x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_conjg__Rd6x6 = conj( mdl_Rd6x6 ); - constexpr cxsmpl mdl_I100x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_I100x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_conjg__Rl3x6 = conj( mdl_Rl3x6 ); - constexpr cxsmpl mdl_I101x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl mdl_I101x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl mdl_conjg__Rl6x6 = conj( mdl_Rl6x6 ); - constexpr cxsmpl mdl_I101x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_I101x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_conjg__Ru3x6 = conj( mdl_Ru3x6 ); - constexpr cxsmpl mdl_I102x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; - constexpr cxsmpl mdl_I102x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; - 
constexpr cxsmpl mdl_conjg__Ru6x6 = conj( mdl_Ru6x6 ); - constexpr cxsmpl mdl_I102x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I102x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I11x33 = mdl_Rd3x6 * mdl_yd3x3; - constexpr cxsmpl mdl_I11x36 = mdl_Rd6x6 * mdl_yd3x3; - constexpr cxsmpl mdl_conjg__Rd3x3 = conj( mdl_Rd3x3 ); - constexpr cxsmpl mdl_I12x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I12x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_conjg__Rd6x3 = conj( mdl_Rd6x3 ); - constexpr cxsmpl mdl_I12x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I12x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I13x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_I13x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_I13x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_I13x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_conjg__td3x3 = conj( mdl_td3x3 ); - constexpr cxsmpl mdl_I14x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I14x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I14x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I14x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I15x33 = mdl_Rd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I15x36 = mdl_Rd6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I15x63 = mdl_Rd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I15x66 = mdl_Rd6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I16x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I16x36 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I16x63 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I16x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I17x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; - 
constexpr cxsmpl mdl_I17x36 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I17x63 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I17x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I18x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I18x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I18x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I18x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I19x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I19x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I19x63 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I19x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I2x33 = mdl_yd3x3 * mdl_conjg__CKM3x3; - constexpr cxsmpl mdl_I20x33 = mdl_CKM3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I21x33 = mdl_CKM3x3 * mdl_yu3x3; - constexpr cxsmpl mdl_conjg__ye3x3 = conj( mdl_ye3x3 ); - constexpr cxsmpl mdl_I22x33 = mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I23x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I23x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_conjg__Rl3x3 = conj( mdl_Rl3x3 ); - constexpr cxsmpl mdl_I24x33 = mdl_ye3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_conjg__Rl6x3 = conj( mdl_Rl6x3 ); - constexpr cxsmpl mdl_I24x36 = mdl_ye3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I25x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I25x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I25x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I25x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I26x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl mdl_I26x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl 
mdl_I26x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_I26x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_I27x33 = mdl_Rl3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I27x36 = mdl_Rl6x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I28x33 = mdl_Rl3x6 * mdl_ye3x3; - constexpr cxsmpl mdl_I28x36 = mdl_Rl6x6 * mdl_ye3x3; - constexpr cxsmpl mdl_I29x33 = mdl_Rl3x3; - constexpr cxsmpl mdl_I29x36 = mdl_Rl6x3; - constexpr cxsmpl mdl_I3x33 = mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I3x36 = mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I30x33 = mdl_Rl3x6 * mdl_ye3x3; - constexpr cxsmpl mdl_I30x36 = mdl_Rl6x6 * mdl_ye3x3; - constexpr cxsmpl mdl_I31x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I31x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I31x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I31x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I32x33 = mdl_Rl3x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl mdl_I32x36 = mdl_Rl6x6 * mdl_conjg__Rl3x6; - constexpr cxsmpl mdl_I32x63 = mdl_Rl3x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_I32x66 = mdl_Rl6x6 * mdl_conjg__Rl6x6; - constexpr cxsmpl mdl_conjg__te3x3 = conj( mdl_te3x3 ); - constexpr cxsmpl mdl_I33x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I33x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I33x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I33x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I34x33 = mdl_Rl3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I34x36 = mdl_Rl6x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I34x63 = mdl_Rl3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I34x66 = mdl_Rl6x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I35x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I35x36 = mdl_Rl6x6 * mdl_te3x3 
* mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I35x63 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I35x66 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I36x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I36x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I36x63 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I36x66 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I37x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I37x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I37x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I37x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I38x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I38x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I38x63 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I38x66 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I39x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I39x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I4x33 = mdl_yd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I4x36 = mdl_yd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I40x33 = mdl_Rl3x6 * mdl_te3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I40x36 = mdl_Rl6x6 * mdl_te3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I41x33 = mdl_Rl3x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I41x36 = mdl_Rl6x3 * mdl_ye3x3 * mdl_conjg__Rn3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I42x33 = mdl_Rl3x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I42x36 = mdl_Rl6x6 * mdl_ye3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I44x33 = mdl_Rn3x3 * mdl_conjg__ye3x3; - 
constexpr cxsmpl mdl_I45x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I45x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I46x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I46x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I47x33 = mdl_Rn3x3 * mdl_conjg__Rl3x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I47x36 = mdl_Rn3x3 * mdl_conjg__Rl6x6 * mdl_conjg__te3x3; - constexpr cxsmpl mdl_I48x33 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl3x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I48x36 = mdl_Rn3x3 * mdl_ye3x3 * mdl_conjg__Rl6x3 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I49x33 = mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I49x36 = mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I5x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I5x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I5x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I5x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_conjg__Ru3x3 = conj( mdl_Ru3x3 ); - constexpr cxsmpl mdl_I50x33 = mdl_yu3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_conjg__Ru6x3 = conj( mdl_Ru6x3 ); - constexpr cxsmpl mdl_I50x36 = mdl_yu3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I51x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I51x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I51x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I51x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I52x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; - constexpr cxsmpl mdl_I52x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; - constexpr cxsmpl mdl_I52x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I52x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I53x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I53x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I53x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - 
constexpr cxsmpl mdl_I53x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_conjg__tu3x3 = conj( mdl_tu3x3 ); - constexpr cxsmpl mdl_I54x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I54x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I54x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I54x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I55x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I55x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I55x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I55x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I56x33 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I56x36 = mdl_Rd3x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I56x63 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I56x66 = mdl_Rd6x6 * mdl_td3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I57x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I57x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I57x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I57x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I58x33 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I58x36 = mdl_Rd3x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I58x63 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * 
mdl_conjg__Ru3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I58x66 = mdl_Rd6x3 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I59x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I59x36 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I59x63 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I59x66 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I6x33 = mdl_Rd3x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_I6x36 = mdl_Rd6x6 * mdl_conjg__Rd3x6; - constexpr cxsmpl mdl_I6x63 = mdl_Rd3x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_I6x66 = mdl_Rd6x6 * mdl_conjg__Rd6x6; - constexpr cxsmpl mdl_I60x33 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I60x36 = mdl_Rd3x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I60x63 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I60x66 = mdl_Rd6x3 * mdl_yu3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I61x33 = mdl_Ru3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I61x36 = mdl_Ru6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I62x33 = mdl_Ru3x6 * mdl_yu3x3; - constexpr cxsmpl mdl_I62x36 = mdl_Ru6x6 * mdl_yu3x3; - constexpr cxsmpl mdl_I63x33 = mdl_CKM3x3 * mdl_Ru3x3; - constexpr cxsmpl mdl_I63x36 = mdl_CKM3x3 * mdl_Ru6x3; - constexpr cxsmpl mdl_I64x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I64x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I65x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3; - constexpr cxsmpl mdl_I65x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3; - constexpr cxsmpl mdl_I66x33 = mdl_CKM3x3 * mdl_Ru3x3 * 
mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I66x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I66x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I66x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I67x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I67x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I67x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I67x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I68x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I68x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I68x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I68x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x6 * mdl_conjg__td3x3; - constexpr cxsmpl mdl_I69x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I69x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I69x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I69x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I7x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3; - constexpr cxsmpl mdl_I7x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3; - constexpr cxsmpl mdl_I70x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I70x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I70x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I70x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yd3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I71x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; - 
constexpr cxsmpl mdl_I71x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I71x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I71x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Rd6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I72x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I72x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I72x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I72x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I73x33 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I73x36 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I73x63 = mdl_CKM3x3 * mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I73x66 = mdl_CKM3x3 * mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I74x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I74x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I74x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I74x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I75x33 = mdl_Ru3x6 * mdl_conjg__Ru3x6; - constexpr cxsmpl mdl_I75x36 = mdl_Ru6x6 * mdl_conjg__Ru3x6; - constexpr cxsmpl mdl_I75x63 = mdl_Ru3x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I75x66 = mdl_Ru6x6 * mdl_conjg__Ru6x6; - constexpr cxsmpl mdl_I76x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I76x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I76x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I76x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I77x33 = mdl_Ru3x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; - 
constexpr cxsmpl mdl_I77x36 = mdl_Ru6x3 * mdl_conjg__Ru3x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I77x63 = mdl_Ru3x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I77x66 = mdl_Ru6x3 * mdl_conjg__Ru6x6 * mdl_conjg__tu3x3; - constexpr cxsmpl mdl_I78x33 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I78x36 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I78x63 = mdl_Ru3x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I78x66 = mdl_Ru6x6 * mdl_tu3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I79x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I79x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I79x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I79x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I8x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I8x36 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I80x33 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I80x36 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru3x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I80x63 = mdl_Ru3x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I80x66 = mdl_Ru6x3 * mdl_yu3x3 * mdl_conjg__Ru6x3 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I81x33 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I81x36 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I81x63 = mdl_Ru3x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I81x66 = mdl_Ru6x6 * mdl_yu3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I82x33 = mdl_CKM3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I82x36 = mdl_CKM3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I83x33 = mdl_CKM3x3 * mdl_conjg__Rd3x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I83x36 = mdl_CKM3x3 * 
mdl_conjg__Rd6x6 * mdl_conjg__yd3x3; - constexpr cxsmpl mdl_I84x33 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I84x36 = mdl_CKM3x3 * mdl_yu3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I85x33 = mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I85x36 = mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I86x33 = mdl_conjg__Rl3x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I86x36 = mdl_conjg__Rl6x6 * mdl_conjg__ye3x3; - constexpr cxsmpl mdl_I88x33 = mdl_ye3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I89x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I89x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I9x33 = mdl_Rd3x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; - constexpr cxsmpl mdl_I9x36 = mdl_Rd6x6 * mdl_yd3x3 * mdl_conjg__CKM3x3; - constexpr cxsmpl mdl_I90x33 = mdl_conjg__CKM3x3 * mdl_conjg__Ru3x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I90x36 = mdl_conjg__CKM3x3 * mdl_conjg__Ru6x6 * mdl_conjg__yu3x3; - constexpr cxsmpl mdl_I91x33 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I91x36 = mdl_yd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I92x33 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I92x36 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I92x63 = mdl_CKM3x3 * mdl_Ru3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I92x66 = mdl_CKM3x3 * mdl_Ru6x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I93x33 = mdl_Rn3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I93x36 = mdl_Rn3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I94x33 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I94x36 = mdl_Rd3x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I94x63 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I94x66 = mdl_Rd6x3 * mdl_conjg__CKM3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I95x33 = mdl_Rl3x3 * mdl_conjg__Rn3x3; - constexpr cxsmpl mdl_I95x36 = mdl_Rl6x3 * mdl_conjg__Rn3x3; - 
constexpr cxsmpl mdl_I96x33 = mdl_Rd3x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I96x36 = mdl_Rd6x3 * mdl_conjg__Rd3x3; - constexpr cxsmpl mdl_I96x63 = mdl_Rd3x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I96x66 = mdl_Rd6x3 * mdl_conjg__Rd6x3; - constexpr cxsmpl mdl_I97x33 = mdl_Rl3x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I97x36 = mdl_Rl6x3 * mdl_conjg__Rl3x3; - constexpr cxsmpl mdl_I97x63 = mdl_Rl3x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I97x66 = mdl_Rl6x3 * mdl_conjg__Rl6x3; - constexpr cxsmpl mdl_I98x33 = mdl_Ru3x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I98x36 = mdl_Ru6x3 * mdl_conjg__Ru3x3; - constexpr cxsmpl mdl_I98x63 = mdl_Ru3x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I98x66 = mdl_Ru6x3 * mdl_conjg__Ru6x3; - constexpr cxsmpl mdl_I99x33 = mdl_ye3x3; - constexpr cxsmpl mdl_complexi = cxsmpl( 0., 1. ); - constexpr double mdl_sqrt__2 = constexpr_sqrt( 2. ); - constexpr double mdl_sw__exp__2 = ( ( mdl_sw ) * ( mdl_sw ) ); - constexpr cxsmpl mdl_conjg__NN1x1 = conj( mdl_NN1x1 ); - constexpr cxsmpl mdl_conjg__NN1x2 = conj( mdl_NN1x2 ); - constexpr cxsmpl mdl_conjg__NN1x3 = conj( mdl_NN1x3 ); - constexpr cxsmpl mdl_conjg__NN1x4 = conj( mdl_NN1x4 ); - constexpr cxsmpl mdl_conjg__NN2x1 = conj( mdl_NN2x1 ); - constexpr cxsmpl mdl_conjg__NN2x2 = conj( mdl_NN2x2 ); - constexpr cxsmpl mdl_conjg__NN2x3 = conj( mdl_NN2x3 ); - constexpr cxsmpl mdl_conjg__NN2x4 = conj( mdl_NN2x4 ); - constexpr cxsmpl mdl_conjg__NN3x1 = conj( mdl_NN3x1 ); - constexpr cxsmpl mdl_conjg__NN3x2 = conj( mdl_NN3x2 ); - constexpr cxsmpl mdl_conjg__NN3x3 = conj( mdl_NN3x3 ); - constexpr cxsmpl mdl_conjg__NN3x4 = conj( mdl_NN3x4 ); - constexpr cxsmpl mdl_conjg__NN4x1 = conj( mdl_NN4x1 ); - constexpr cxsmpl mdl_conjg__NN4x2 = conj( mdl_NN4x2 ); - constexpr cxsmpl mdl_conjg__NN4x3 = conj( mdl_NN4x3 ); - constexpr cxsmpl mdl_conjg__NN4x4 = conj( mdl_NN4x4 ); - constexpr cxsmpl mdl_conjg__UU1x1 = conj( mdl_UU1x1 ); - constexpr cxsmpl mdl_conjg__UU1x2 = conj( mdl_UU1x2 ); - constexpr 
cxsmpl mdl_conjg__UU2x1 = conj( mdl_UU2x1 ); - constexpr cxsmpl mdl_conjg__UU2x2 = conj( mdl_UU2x2 ); - constexpr cxsmpl mdl_conjg__VV1x1 = conj( mdl_VV1x1 ); - constexpr cxsmpl mdl_conjg__VV1x2 = conj( mdl_VV1x2 ); - constexpr cxsmpl mdl_conjg__VV2x1 = conj( mdl_VV2x1 ); - constexpr cxsmpl mdl_conjg__VV2x2 = conj( mdl_VV2x2 ); - constexpr double mdl_cos__alp = cos( mdl_alp ); - constexpr double mdl_sin__alp = sin( mdl_alp ); - constexpr cxsmpl mdl_conjg__MUH = conj( mdl_MUH ); - constexpr double mdl_ee = 2. * constexpr_sqrt( 1. / aEWM1 ) * constexpr_sqrt( M_PI ); - constexpr double mdl_gp = mdl_ee / mdl_cw; - constexpr double mdl_gw = mdl_ee / mdl_sw; - constexpr double mdl_vev = ( 2. * mdl_cw * mdl_MZ * mdl_sw ) / mdl_ee; - constexpr double mdl_vd = mdl_vev * mdl_cos__beta; - constexpr double mdl_vu = mdl_vev * mdl_sin__beta; - constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); - - // Fixes for Majorana particles - constexpr int mdl_Wneu2_sign = ( mdl_Mneu2 < 0 ? -1 : + 1 ); - constexpr int mdl_Wneu3_sign = ( mdl_Mneu3 < 0 ? -1 : + 1 ); - constexpr int mdl_Wneu4_sign = ( mdl_Mneu4 < 0 ? -1 : + 1 ); - constexpr int mdl_Wgo_sign = ( mdl_Mgo < 0 ? -1 : + 1 ); - constexpr double mdl_Wneu2 = mdl_Wneu2_sign * mdl_Wneu2_abs; - constexpr double mdl_Wneu3 = mdl_Wneu3_sign * mdl_Wneu3_abs; - constexpr double mdl_Wneu4 = mdl_Wneu4_sign * mdl_Wneu4_abs; - constexpr double mdl_Wgo = mdl_Wgo_sign * mdl_Wgo_abs; - - // Model couplings independent of aS - // (none) - - // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. 
* mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) - //constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) - - // Model couplings dependent on aS - //constexpr cxsmpl GC_6 = -G; // now computed event-by-event (running alphas #373) - //constexpr cxsmpl GC_51 = -( mdl_complexi * G * mdl_I51x11 ); // now computed event-by-event (running alphas #373) - - // Print parameters that are unchanged during the run - void printIndependentParameters(); - - // Print couplings that are unchanged during the run - void printIndependentCouplings(); - - // Print parameters that are changed event by event - //void printDependentParameters(); // now computed event-by-event (running alphas #373) - - // Print couplings that are changed event by event - //void printDependentCouplings(); // now computed event-by-event (running alphas #373) -} +} // end namespace mg5amcGpu/mg5amcCpu #endif //========================================================================== -namespace Parameters_MSSM_SLHA2_dependentCouplings +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { - constexpr size_t ndcoup = 2; // #couplings that vary event by event because they depend on the running alphas QCD - constexpr size_t idcoup_GC_6 = 0; - constexpr size_t idcoup_GC_51 = 1; - struct DependentCouplings_sv + namespace Parameters_MSSM_SLHA2_dependentCouplings { - cxtype_sv GC_6; - cxtype_sv GC_51; - }; + constexpr size_t ndcoup = 2; // #couplings that vary event by event because they depend on the running alphas QCD + constexpr size_t idcoup_GC_6 = 0; + constexpr size_t idcoup_GC_51 = 1; + struct DependentCouplings_sv + { + cxtype_sv GC_6; + cxtype_sv GC_51; + }; #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. 
<> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) - { + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + { #ifdef MGONGPU_HARDCODE_PARAM - using namespace Parameters_MSSM_SLHA2; + using namespace Parameters_MSSM_SLHA2; #endif - // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: - // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below - const cxtype cI( 0., 1. ); - DependentCouplings_sv out; - // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) + // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: + // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below + const cxtype cI( 0., 1. ); + DependentCouplings_sv out; + // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) #if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) - { - const fptype_sv& G = G_sv; - // Model parameters dependent on aS - //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); - //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); - // Model couplings dependent on aS - out.GC_6 = -G; - out.GC_51 = -( cI * G * mdl_I51x11 ); - } + { + const fptype_sv& G = G_sv; + // Model parameters dependent on aS + //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype_sv G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); + constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + out.GC_6 = -G; + out.GC_51 = -( cI * G * mdl_I51x11 ); + } #else - // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) - // Use an explicit loop to avoid <> - // Problems may come e.g. in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) - fptype_v GC_6r_v; - fptype_v GC_6i_v; - fptype_v GC_51r_v; - fptype_v GC_51i_v; - for( int i = 0; i < neppV; i++ ) - { - const fptype& G = G_sv[i]; - // Model parameters dependent on aS - //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); - //const fptype G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); - // Model couplings dependent on aS - const cxtype GC_6 = -G; - const cxtype GC_51 = -( cI * G * mdl_I51x11 ); - GC_6r_v[i] = cxreal( GC_6 ); - GC_6i_v[i] = cximag( GC_6 ); - GC_51r_v[i] = cxreal( GC_51 ); - GC_51i_v[i] = cximag( GC_51 ); - } - out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); - out.GC_51 = cxtype_v( GC_51r_v, GC_51i_v ); + // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) + // Use an explicit loop to avoid <> + // Problems may come e.g. in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) + fptype_v GC_6r_v; + fptype_v GC_6i_v; + fptype_v GC_51r_v; + fptype_v GC_51i_v; + for( int i = 0; i < neppV; i++ ) + { + const fptype& G = G_sv[i]; + // Model parameters dependent on aS + //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); + constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + const cxtype GC_6 = -G; + const cxtype GC_51 = -( cI * G * mdl_I51x11 ); + GC_6r_v[i] = cxreal( GC_6 ); + GC_6i_v[i] = cximag( GC_6 ); + GC_51r_v[i] = cxreal( GC_51 ); + GC_51i_v[i] = cximag( GC_51 ); + } + out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); + out.GC_51 = cxtype_v( GC_51r_v, GC_51i_v ); #endif - // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) - return out; - } -#ifdef __CUDACC__ + // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) + return out; + } +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif -} + } -//========================================================================== + //========================================================================== -namespace Parameters_MSSM_SLHA2_independentCouplings -{ - constexpr size_t nicoup = 0; // #couplings that are fixed for all events because they do not depend on the running alphas QCD - // NB: there are no aS-independent couplings in this physics process -} + namespace Parameters_MSSM_SLHA2_independentCouplings + { + constexpr size_t nicoup = 0; // #couplings that are fixed for all events because they do not depend on the running alphas QCD + // NB: there are no aS-independent couplings in this physics process + } -//========================================================================== + //========================================================================== -#ifdef __CUDACC__ -namespace mg5amcGpu -#else -namespace mg5amcCpu -#endif -{ #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. 
<> @@ -885,7 +908,8 @@ namespace mg5amcCpu return; } #pragma GCC diagnostic pop -} + +} // end namespace mg5amcGpu/mg5amcCpu //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk index 4dbc05afe1..79919d1753 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/src/cudacpp_src.mk @@ -1,3 +1,8 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: assume that the same name (e.g. cudacpp.mk, Makefile...) is used in the Subprocess and src directories @@ -14,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -31,9 +36,22 @@ endif # See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html ###RANLIB = ranlib +# Add -mmacosx-version-min=11.3 to avoid "ld: warning: object file was built for newer macOS version than being linked" +LDFLAGS = +ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) +CXXFLAGS += -mmacosx-version-min=11.3 +LDFLAGS += -mmacosx-version-min=11.3 +endif + #------------------------------------------------------------------------------- -#=== Configure ccache for CUDA and C++ builds +#=== Configure the CUDA compiler (note: GPUCC have been exported from cudacpp.mk including ccache) + +###$(info GPUCC=$(GPUCC)) + +#------------------------------------------------------------------------------- + +#=== Configure ccache for C++ builds (note: GPUCC have been exported from cudacpp.mk including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -42,11 +60,6 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -#ifneq ($(NVCC),) -# ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) -# override NVCC:=ccache $(NVCC) -# endif -#endif #------------------------------------------------------------------------------- @@ -73,12 +86,20 @@ endif #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD (exported from cudacpp.mk) +#=== (NB the RNDCXXFLAGS and RNDLIBFLAGS appropriate to user-defined choices of HASCURAND and HASHIPRAND 
have been exported from cudacpp.mk) # Set the build flags appropriate to OMPFLAGS ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -110,7 +131,9 @@ else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) endif else - ifeq ($(AVX),sse4) + ifeq ($(AVX),none) + override AVXFLAGS = -march=x86-64 # no SIMD (see #588) + else ifeq ($(AVX),sse4) override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) else ifeq ($(AVX),avx2) override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] @@ -153,14 +176,6 @@ else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -###$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - CXXFLAGS += -DMGONGPU_HAS_NO_CURAND -else ifneq ($(RNDGEN),hasCurand) - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - #------------------------------------------------------------------------------- #=== Configure build directories and build lockfiles === @@ -171,7 +186,7 @@ override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) # Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) # (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number 
generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) +override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(HASCURAND)_$(HASHIPRAND) # Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 ###$(info Current directory is $(shell pwd)) @@ -211,7 +226,7 @@ MG5AMC_COMMONLIB = mg5amc_common all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so # Target (and build options): debug -debug: OPTFLAGS = -g -O0 -DDEBUG2 +debug: OPTFLAGS = -g -O0 debug: all.$(TAG) # Target: tag-specific build lockfiles @@ -231,18 +246,32 @@ $(LIBDIR)/.build.$(TAG): #------------------------------------------------------------------------------- # Generic target and build rules: objects from C++ compilation -$(BUILDDIR)/%.o : %.cc *.h - @if [ ! -d $(BUILDDIR) ]; then mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ +$(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ + +# Generic target and build rules: objects from CUDA compilation +$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) + @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2.o read_slha.o) +ifneq ($(GPUCC),) +cu_objects=$(addprefix $(BUILDDIR)/, Parameters_MSSM_SLHA2_cu.o) +endif # Target (and build rules): common (src) library +ifneq ($(GPUCC),) +$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) + @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) +else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(CXX) -shared -o$@ $(cxx_objects) + $(CXX) -shared -o $@ $(cxx_objects) $(LDFLAGS) +endif #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h index 6f939b6d4f..22d6921fda 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuConfig.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -5,16 +10,53 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + +// Make sure that __HIP_PLATFORM_NVIDIA__ is undefined +// (__HIP_PLATFORM_AMD__ is defined by hipcc or in HiprandRandomNumberKernel.cc) +#undef __HIP_PLATFORM_NVIDIA__ // disable hiprand for NVidia (curand) + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ -#undef MGONGPU_HAS_NO_CURAND +// For HIP, by default, do not allow curand to be used (hiprand or common random numbers will be used instead) +// For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 #else +//#ifdef __CUDACC__ +//#undef MGONGPU_HAS_NO_CURAND // default +////#define MGONGPU_HAS_NO_CURAND 1 +//#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 +//#endif +#endif + +// Choose if hiprand is supported for generating random numbers +// For CUDA, by default, do not allow hiprand to be used (curand or common random numbers will be used instead) +// For both HIP and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_HIPRAND +// (there may exist HIP installations which do not include hiprand?) 
+#if defined __CUDACC__ +#define MGONGPU_HAS_NO_HIPRAND 1 +#else +//#ifdef __HIPCC__ +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#else +//#undef MGONGPU_HAS_NO_HIPRAND // default +////#define MGONGPU_HAS_NO_HIPRAND 1 +//#endif #endif // Choose floating point precision (for everything but color algebra #537) @@ -47,23 +89,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -79,20 +129,25 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef 
__CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif +// NB: namespace mgOnGpu includes types which are defined in exactly the same way for CPU and GPU builds (see #318 and #725) namespace mgOnGpu { @@ -112,22 +167,6 @@ namespace mgOnGpu typedef float fptype2; // single precision (4 bytes, fp32) #endif - // --- Physics process-specific constants that are best declared at compile time - - const int np4 = 4; // dimensions of 4-momenta (E,px,py,pz) - - const int npari = 2; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- - - const int nparf = 2; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- - - const int npar = npari + nparf; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- - - const int ncomb = 16; // #helicity combinations: e.g. 16 for e+ e- -> mu+ mu- (2**4 = fermion spin up/down ** npar) - - const int nw6 = 6; // dimensions of each wavefunction (HELAS KEK 91-11): e.g. 6 for e+ e- -> mu+ mu- (fermions and vectors) - - const int nwf = 5; // #wavefunctions = #external (npar) + #internal: e.g. 5 for e+ e- -> mu+ mu- (1 internal is gamma or Z) - // --- Platform-specific software implementation details // Maximum number of blocks per grid @@ -142,7 +181,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) 
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -153,7 +192,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -183,9 +222,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -197,8 +236,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h index 9b26c48b79..7ede1dbfae 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h +++ 
b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -14,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -25,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -38,8 +48,12 @@ // COMPLEX TYPES: SIMPLE COMPLEX CLASS (cxsmpl) //========================================================================== +// NB: namespace mgOnGpu includes types which are defined in exactly the same way for CPU and GPU builds (see #318 and #725) namespace mgOnGpu /* clang-format off */ { + // The number of floating point types in a complex type (real, imaginary) + constexpr int nx2 = 2; + // --- Type definition (simple complex type derived from cxtype_v) template class cxsmpl @@ -62,8 +76,7 @@ namespace mgOnGpu /* clang-format off */ }; template - constexpr // (NB: now valid code? 
in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") - inline __host__ __device__ cxsmpl + inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); @@ -74,130 +87,147 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -template -inline __host__ __device__ std::ostream& -operator<<( std::ostream& out, const cxsmpl& c ) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { - out << std::complex( c.real(), c.imag() ); - return out; -} + template + inline __host__ std::ostream& + operator<<( std::ostream& out, const cxsmpl& c ) + { + out << std::complex( c.real(), c.imag() ); + return out; + } -// Operators for cxsmpl -template -inline __host__ __device__ constexpr cxsmpl -operator+( const cxsmpl a ) -{ - return a; -} + // Operators for cxsmpl + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl a ) + { + return a; + } -template -inline __host__ __device__ constexpr cxsmpl -operator-( const cxsmpl& a ) -{ - return cxsmpl( -a.real(), -a.imag() ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a ) + { + return cxsmpl( -a.real(), -a.imag() ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator+( const cxsmpl& a, const cxsmpl& b ) -{ - return cxsmpl( a.real() + b.real(), a.imag() + b.imag() ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() + b.real(), a.imag() + b.imag() ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator+( const FP& a, const cxsmpl& b ) -{ - return cxsmpl( a, 0 ) + b; -} + template + inline __host__ __device__ constexpr cxsmpl + operator+( const FP& a, const cxsmpl& b ) + { 
+ return cxsmpl( a, 0 ) + b; + } -template -inline __host__ __device__ constexpr cxsmpl -operator-( const cxsmpl& a, const cxsmpl& b ) -{ - return cxsmpl( a.real() - b.real(), a.imag() - b.imag() ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() - b.real(), a.imag() - b.imag() ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator-( const FP& a, const cxsmpl& b ) -{ - return cxsmpl( a, 0 ) - b; -} + template + inline __host__ __device__ constexpr cxsmpl + operator-( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) - b; + } -template -inline __host__ __device__ constexpr cxsmpl -operator*( const cxsmpl& a, const cxsmpl& b ) -{ - return cxsmpl( a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag() ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const cxsmpl& b ) + { + return cxsmpl( a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag() ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator*( const FP& a, const cxsmpl& b ) -{ - return cxsmpl( a, 0 ) * b; -} + template + inline __host__ __device__ constexpr cxsmpl + operator*( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) * b; + } -inline __host__ __device__ constexpr cxsmpl -operator*( const double& a, const cxsmpl& b ) -{ - return cxsmpl( a, 0 ) * b; -} + inline __host__ __device__ constexpr cxsmpl + operator*( const double& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) * b; + } -template -inline __host__ __device__ constexpr cxsmpl -operator/( const cxsmpl& a, const cxsmpl& b ) -{ - FP bnorm = b.real() * b.real() + b.imag() * b.imag(); - return cxsmpl( ( a.real() * b.real() + a.imag() * b.imag() ) / bnorm, - ( a.imag() * b.real() - a.real() * b.imag() ) / bnorm ); -} + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const double& b ) + { 
+ return a * cxsmpl( b, 0 ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator/( const FP& a, const cxsmpl& b ) -{ - return cxsmpl( a, 0 ) / b; -} + template + inline __host__ __device__ constexpr cxsmpl + operator/( const cxsmpl& a, const cxsmpl& b ) + { + FP bnorm = b.real() * b.real() + b.imag() * b.imag(); + return cxsmpl( ( a.real() * b.real() + a.imag() * b.imag() ) / bnorm, + ( a.imag() * b.real() - a.real() * b.imag() ) / bnorm ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator+( const cxsmpl& a, const FP& b ) -{ - return a + cxsmpl( b, 0 ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator/( const FP& a, const cxsmpl& b ) + { + return cxsmpl( a, 0 ) / b; + } -template -inline __host__ __device__ constexpr cxsmpl -operator-( const cxsmpl& a, const FP& b ) -{ - return a - cxsmpl( b, 0 ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator+( const cxsmpl& a, const FP& b ) + { + return a + cxsmpl( b, 0 ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator*( const cxsmpl& a, const FP& b ) -{ - return a * cxsmpl( b, 0 ); -} + template + inline __host__ __device__ constexpr cxsmpl + operator-( const cxsmpl& a, const FP& b ) + { + return a - cxsmpl( b, 0 ); + } -template -inline __host__ __device__ constexpr cxsmpl -operator/( const cxsmpl& a, const FP& b ) -{ - return a / cxsmpl( b, 0 ); + template + inline __host__ __device__ constexpr cxsmpl + operator*( const cxsmpl& a, const FP& b ) + { + return a * cxsmpl( b, 0 ); + } + + template + inline __host__ __device__ constexpr cxsmpl + operator/( const cxsmpl& a, const FP& b ) + { + return a / cxsmpl( b, 0 ); + } } //========================================================================== // COMPLEX TYPES: (PLATFORM-SPECIFIC) TYPEDEFS //========================================================================== -namespace mgOnGpu +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different 
ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { - // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -217,390 +247,402 @@ namespace mgOnGpu #endif #endif - // The number of floating point types in a complex type (real, imaginary) - constexpr int nx2 = 2; - // SANITY CHECK: memory access may be based on casts of fptype[2] to cxtype (e.g. for wavefunctions) - static_assert( sizeof( cxtype ) == nx2 * sizeof( fptype ), "sizeof(cxtype) is not 2*sizeof(fptype)" ); + static_assert( sizeof( cxtype ) == mgOnGpu::nx2 * sizeof( fptype ), "sizeof(cxtype) is not 2*sizeof(fptype)" ); } -// Expose typedefs and operators outside the namespace -using mgOnGpu::cxtype; +// DANGEROUS! this was mixing different cxtype definitions for CPU and GPU builds (see #318 and #725) +// DO NOT expose typedefs and operators outside the namespace +//using mgOnGpu::cxtype; //========================================================================== // COMPLEX TYPES: (PLATFORM-SPECIFIC) FUNCTIONS AND OPERATORS //========================================================================== -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL -//------------------------------ -// CUDA or C++ - using cxsmpl -//------------------------------ + //------------------------------ + // CUDA or C++ - using cxsmpl + //------------------------------ -inline __host__ __device__ cxtype 
-cxmake( const fptype& r, const fptype& i ) -{ - return cxtype( r, i ); // cxsmpl constructor -} + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // cxsmpl constructor + } -inline __host__ __device__ fptype -cxreal( const cxtype& c ) -{ - return c.real(); // cxsmpl::real() -} + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return c.real(); // cxsmpl::real() + } -inline __host__ __device__ fptype -cximag( const cxtype& c ) -{ - return c.imag(); // cxsmpl::imag() -} + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return c.imag(); // cxsmpl::imag() + } -inline __host__ __device__ cxtype -cxconj( const cxtype& c ) -{ - return conj( c ); // conj( cxsmpl ) -} + inline __host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( cxsmpl ) + } -inline __host__ cxtype // NOT __device__ -cxmake( const std::complex& c ) // std::complex to cxsmpl (float-to-float or float-to-double) -{ - return cxmake( c.real(), c.imag() ); -} + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cxsmpl (float-to-float or float-to-double) + { + return cxmake( c.real(), c.imag() ); + } -inline __host__ cxtype // NOT __device__ -cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-float or double-to-double) -{ - return cxmake( c.real(), c.imag() ); -} + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cxsmpl (double-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL -//========================================================================== + //========================================================================== -#if defined __CUDACC__ 
and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) -//------------------------------ -// CUDA - using thrust::complex -//------------------------------ + //------------------------------ + // CUDA - using thrust::complex + //------------------------------ -inline __host__ __device__ cxtype -cxmake( const fptype& r, const fptype& i ) -{ - return cxtype( r, i ); // thrust::complex constructor -} + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // thrust::complex constructor + } -inline __host__ __device__ fptype -cxreal( const cxtype& c ) -{ - return c.real(); // thrust::complex::real() -} + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return c.real(); // thrust::complex::real() + } -inline __host__ __device__ fptype -cximag( const cxtype& c ) -{ - return c.imag(); // thrust::complex::imag() -} + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return c.imag(); // thrust::complex::imag() + } -inline __host__ __device__ cxtype -cxconj( const cxtype& c ) -{ - return conj( c ); // conj( thrust::complex ) -} + inline __host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( thrust::complex ) + } -inline __host__ __device__ const cxtype& -cxmake( const cxtype& c ) -{ - return c; -} + inline __host__ __device__ const cxtype& + cxmake( const cxtype& c ) + { + return c; + } #endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST -//========================================================================== + //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) 
-//------------------------------ -// CUDA - using cuComplex -//------------------------------ + //------------------------------ + // CUDA - using cuComplex + //------------------------------ #if defined MGONGPU_FPTYPE_DOUBLE // cuda + cucomplex + double -//+++++++++++++++++++++++++ -// cuDoubleComplex ONLY -//+++++++++++++++++++++++++ + //+++++++++++++++++++++++++ + // cuDoubleComplex ONLY + //+++++++++++++++++++++++++ -inline __host__ __device__ cxtype -cxmake( const fptype& r, const fptype& i ) -{ - return make_cuDoubleComplex( r, i ); -} + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return make_cuDoubleComplex( r, i ); + } -inline __host__ __device__ fptype -cxreal( const cxtype& c ) -{ - return cuCreal( c ); // returns by value -} + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return cuCreal( c ); // returns by value + } -inline __host__ __device__ fptype -cximag( const cxtype& c ) -{ - return cuCimag( c ); // returns by value -} + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return cuCimag( c ); // returns by value + } -inline __host__ __device__ cxtype -operator+( const cxtype& a, const cxtype& b ) -{ - return cuCadd( a, b ); -} + inline __host__ __device__ cxtype + operator+( const cxtype& a, const cxtype& b ) + { + return cuCadd( a, b ); + } -inline __host__ __device__ cxtype& -operator+=( cxtype& a, const cxtype& b ) -{ - a = cuCadd( a, b ); - return a; -} + inline __host__ __device__ cxtype& + operator+=( cxtype& a, const cxtype& b ) + { + a = cuCadd( a, b ); + return a; + } -inline __host__ __device__ cxtype -operator-( const cxtype& a, const cxtype& b ) -{ - return cuCsub( a, b ); -} + inline __host__ __device__ cxtype + operator-( const cxtype& a, const cxtype& b ) + { + return cuCsub( a, b ); + } -inline __host__ __device__ cxtype& -operator-=( cxtype& a, const cxtype& b ) -{ - a = cuCsub( a, b ); - return a; -} + inline __host__ __device__ cxtype& + operator-=( 
cxtype& a, const cxtype& b ) + { + a = cuCsub( a, b ); + return a; + } -inline __host__ __device__ cxtype -operator*( const cxtype& a, const cxtype& b ) -{ - return cuCmul( a, b ); -} + inline __host__ __device__ cxtype + operator*( const cxtype& a, const cxtype& b ) + { + return cuCmul( a, b ); + } -inline __host__ __device__ cxtype -operator/( const cxtype& a, const cxtype& b ) -{ - return cuCdiv( a, b ); -} + inline __host__ __device__ cxtype + operator/( const cxtype& a, const cxtype& b ) + { + return cuCdiv( a, b ); + } #elif defined MGONGPU_FPTYPE_FLOAT // cuda + cucomplex + float -//+++++++++++++++++++++++++ -// cuFloatComplex ONLY -//+++++++++++++++++++++++++ + //+++++++++++++++++++++++++ + // cuFloatComplex ONLY + //+++++++++++++++++++++++++ -inline __host__ __device__ cxtype -cxmake( const fptype& r, const fptype& i ) -{ - return make_cuFloatComplex( r, i ); -} + inline __host__ __device__ cxtype + cxmake( const fptype& r, const fptype& i ) + { + return make_cuFloatComplex( r, i ); + } -inline __host__ __device__ fptype -cxreal( const cxtype& c ) -{ - return cuCrealf( c ); // returns by value -} + inline __host__ __device__ fptype + cxreal( const cxtype& c ) + { + return cuCrealf( c ); // returns by value + } -inline __host__ __device__ fptype -cximag( const cxtype& c ) -{ - return cuCimagf( c ); // returns by value -} + inline __host__ __device__ fptype + cximag( const cxtype& c ) + { + return cuCimagf( c ); // returns by value + } -inline __host__ __device__ cxtype -operator+( const cxtype& a, const cxtype& b ) -{ - return cuCaddf( a, b ); -} + inline __host__ __device__ cxtype + operator+( const cxtype& a, const cxtype& b ) + { + return cuCaddf( a, b ); + } -inline __host__ __device__ cxtype& -operator+=( cxtype& a, const cxtype& b ) -{ - a = cuCaddf( a, b ); - return a; -} + inline __host__ __device__ cxtype& + operator+=( cxtype& a, const cxtype& b ) + { + a = cuCaddf( a, b ); + return a; + } -inline __host__ __device__ cxtype -operator-( const 
cxtype& a, const cxtype& b ) -{ - return cuCsubf( a, b ); -} + inline __host__ __device__ cxtype + operator-( const cxtype& a, const cxtype& b ) + { + return cuCsubf( a, b ); + } -inline __host__ __device__ cxtype& -operator-=( cxtype& a, const cxtype& b ) -{ - a = cuCsubf( a, b ); - return a; -} + inline __host__ __device__ cxtype& + operator-=( cxtype& a, const cxtype& b ) + { + a = cuCsubf( a, b ); + return a; + } -inline __host__ __device__ cxtype -operator*( const cxtype& a, const cxtype& b ) -{ - return cuCmulf( a, b ); -} + inline __host__ __device__ cxtype + operator*( const cxtype& a, const cxtype& b ) + { + return cuCmulf( a, b ); + } -inline __host__ __device__ cxtype -operator/( const cxtype& a, const cxtype& b ) -{ - return cuCdivf( a, b ); -} + inline __host__ __device__ cxtype + operator/( const cxtype& a, const cxtype& b ) + { + return cuCdivf( a, b ); + } -inline __host__ cxtype // NOT __device__ -cxmake( const std::complex& c ) // std::complex to cucomplex (cast double-to-float) -{ - return cxmake( (fptype)c.real(), (fptype)c.imag() ); -} + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cucomplex (cast double-to-float) + { + return cxmake( (fptype)c.real(), (fptype)c.imag() ); + } #endif -//+++++++++++++++++++++++++ -// cuDoubleComplex OR -// cuFloatComplex -//+++++++++++++++++++++++++ + //+++++++++++++++++++++++++ + // cuDoubleComplex OR + // cuFloatComplex + //+++++++++++++++++++++++++ -inline __host__ __device__ cxtype -operator+( const cxtype a ) -{ - return a; -} + inline __host__ __device__ cxtype + operator+( const cxtype a ) + { + return a; + } -inline __host__ __device__ cxtype -operator-( const cxtype& a ) -{ - return cxmake( -cxreal( a ), -cximag( a ) ); -} + inline __host__ __device__ cxtype + operator-( const cxtype& a ) + { + return cxmake( -cxreal( a ), -cximag( a ) ); + } -inline __host__ __device__ cxtype -operator+( const fptype& a, const cxtype& b ) -{ - return cxmake( a, 0 ) + b; 
-} + inline __host__ __device__ cxtype + operator+( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) + b; + } -inline __host__ __device__ cxtype -operator-( const fptype& a, const cxtype& b ) -{ - return cxmake( a, 0 ) - b; -} + inline __host__ __device__ cxtype + operator-( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) - b; + } -inline __host__ __device__ cxtype -operator*( const fptype& a, const cxtype& b ) -{ - return cxmake( a, 0 ) * b; -} + inline __host__ __device__ cxtype + operator*( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) * b; + } -inline __host__ __device__ cxtype -operator/( const fptype& a, const cxtype& b ) -{ - return cxmake( a, 0 ) / b; -} + inline __host__ __device__ cxtype + operator/( const fptype& a, const cxtype& b ) + { + return cxmake( a, 0 ) / b; + } -inline __host__ __device__ cxtype -operator+( const cxtype& a, const fptype& b ) -{ - return a + cxmake( b, 0 ); -} + inline __host__ __device__ cxtype + operator+( const cxtype& a, const fptype& b ) + { + return a + cxmake( b, 0 ); + } -inline __host__ __device__ cxtype -operator-( const cxtype& a, const fptype& b ) -{ - return a - cxmake( b, 0 ); -} + inline __host__ __device__ cxtype + operator-( const cxtype& a, const fptype& b ) + { + return a - cxmake( b, 0 ); + } -inline __host__ __device__ cxtype -operator*( const cxtype& a, const fptype& b ) -{ - return a * cxmake( b, 0 ); -} + inline __host__ __device__ cxtype + operator*( const cxtype& a, const fptype& b ) + { + return a * cxmake( b, 0 ); + } -inline __host__ __device__ cxtype -operator/( const cxtype& a, const fptype& b ) -{ - return a / cxmake( b, 0 ); -} + inline __host__ __device__ cxtype + operator/( const cxtype& a, const fptype& b ) + { + return a / cxmake( b, 0 ); + } -inline __host__ __device__ cxtype -cxconj( const cxtype& c ) -{ - return cxmake( cxreal( c ), -cximag( c ) ); -} + inline __host__ __device__ cxtype + cxconj( const cxtype& c ) + { + return cxmake( cxreal( c 
), -cximag( c ) ); + } -inline __host__ cxtype // NOT __device__ -cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-float or double-to-double) -{ - return cxmake( c.real(), c.imag() ); -} + inline __host__ cxtype // NOT __device__ + cxmake( const std::complex& c ) // std::complex to cucomplex (float-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } #endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX -//========================================================================== + //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) -//------------------------------ -// C++ - using std::complex -//------------------------------ + //------------------------------ + // C++ - using std::complex + //------------------------------ -inline cxtype -cxmake( const fptype& r, const fptype& i ) -{ - return cxtype( r, i ); // std::complex constructor -} + inline cxtype + cxmake( const fptype& r, const fptype& i ) + { + return cxtype( r, i ); // std::complex constructor + } -inline fptype -cxreal( const cxtype& c ) -{ - return c.real(); // std::complex::real() -} + inline fptype + cxreal( const cxtype& c ) + { + return c.real(); // std::complex::real() + } -inline fptype -cximag( const cxtype& c ) -{ - return c.imag(); // std::complex::imag() -} + inline fptype + cximag( const cxtype& c ) + { + return c.imag(); // std::complex::imag() + } -inline cxtype -cxconj( const cxtype& c ) -{ - return conj( c ); // conj( std::complex ) -} + inline cxtype + cxconj( const cxtype& c ) + { + return conj( c ); // conj( std::complex ) + } -inline const cxtype& -cxmake( const cxtype& c ) // std::complex to std::complex (float-to-float or double-to-double) -{ - return c; -} + 
inline const cxtype& + cxmake( const cxtype& c ) // std::complex to std::complex (float-to-float or double-to-double) + { + return c; + } #if defined MGONGPU_FPTYPE_FLOAT -inline cxtype -cxmake( const std::complex& c ) // std::complex to std::complex (cast double-to-float) -{ - return cxmake( (fptype)c.real(), (fptype)c.imag() ); -} + inline cxtype + cxmake( const std::complex& c ) // std::complex to std::complex (cast double-to-float) + { + return cxmake( (fptype)c.real(), (fptype)c.imag() ); + } #endif #endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX -//========================================================================== + //========================================================================== -inline __host__ __device__ const cxtype -cxmake( const cxsmpl& c ) // cxsmpl to cxtype (float-to-float or float-to-double) -{ - return cxmake( c.real(), c.imag() ); -} + inline __host__ __device__ const cxtype + cxmake( const cxsmpl& c ) // cxsmpl to cxtype (float-to-float or float-to-double) + { + return cxmake( c.real(), c.imag() ); + } -inline __host__ __device__ const cxtype -cxmake( const cxsmpl& c ) // cxsmpl to cxtype (double-to-float or double-to-double) -{ - return cxmake( c.real(), c.imag() ); -} + inline __host__ __device__ const cxtype + cxmake( const cxsmpl& c ) // cxsmpl to cxtype (double-to-float or double-to-double) + { + return cxmake( c.real(), c.imag() ); + } + +} // end namespace mg5amcGpu/mg5amcCpu //========================================================================== // COMPLEX TYPES: WRAPPER OVER RI FLOATING POINT PAIR (cxtype_ref) //========================================================================== -namespace mgOnGpu /* clang-format off */ +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { // The cxtype_ref class (a non-const 
reference to two fp variables) was originally designed for cxtype_v::operator[] // It used to be included in the code only when MGONGPU_HAS_CPPCXTYPEV_BRK (originally MGONGPU_HAS_CPPCXTYPE_REF) is defined @@ -611,23 +653,30 @@ namespace mgOnGpu /* clang-format off */ cxtype_ref() = delete; cxtype_ref( const cxtype_ref& ) = delete; cxtype_ref( cxtype_ref&& ) = default; // copy refs - __host__ __device__ cxtype_ref( fptype& r, fptype& i ) : m_preal( &r ), m_pimag( &i ) {} // copy refs + __host__ __device__ cxtype_ref( fptype& r, fptype& i ) + : m_preal( &r ), m_pimag( &i ) {} // copy refs cxtype_ref& operator=( const cxtype_ref& ) = delete; //__host__ __device__ cxtype_ref& operator=( cxtype_ref&& c ) {...} // REMOVED! Should copy refs or copy values? No longer needed in cxternary - __host__ __device__ cxtype_ref& operator=( const cxtype& c ) { *m_preal = cxreal( c ); *m_pimag = cximag( c ); return *this; } // copy values + __host__ __device__ cxtype_ref& operator=( const cxtype& c ) + { + *m_preal = cxreal( c ); + *m_pimag = cximag( c ); + return *this; + } // copy values __host__ __device__ operator cxtype() const { return cxmake( *m_preal, *m_pimag ); } private: fptype *m_preal, *m_pimag; // RI }; -} /* clang-format on */ -// Printout to stream for user defined types -inline __host__ __device__ std::ostream& -operator<<( std::ostream& out, const mgOnGpu::cxtype_ref& c ) -{ - out << (cxtype)c; - return out; -} + // Printout to stream for user defined types + inline __host__ __device__ std::ostream& + operator<<( std::ostream& out, const cxtype_ref& c ) + { + out << (cxtype)c; + return out; + } + +} // end namespace mg5amcGpu/mg5amcCpu //========================================================================== diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h index b278275f80..fa3a02664b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h +++ 
b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -6,82 +11,91 @@ #include #include -//========================================================================== - -#ifdef __CUDACC__ // cuda - -//------------------------------ -// Floating point types - Cuda -//------------------------------ - -/* -inline __host__ __device__ fptype -fpmax( const fptype& a, const fptype& b ) -{ - return max( a, b ); -} - -inline __host__ __device__ fptype -fpmin( const fptype& a, const fptype& b ) -{ - return min( a, b ); -} -*/ - -inline __host__ __device__ const fptype& -fpmax( const fptype& a, const fptype& b ) -{ - return ( ( b < a ) ? a : b ); -} - -inline __host__ __device__ const fptype& -fpmin( const fptype& a, const fptype& b ) -{ - return ( ( a < b ) ? a : b ); -} - -inline __host__ __device__ fptype -fpsqrt( const fptype& f ) +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL // cuda +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { + //========================================================================== + +#ifdef MGONGPUCPP_GPUIMPL // cuda + + //------------------------------ + // Floating point types - Cuda + //------------------------------ + + /* + inline __host__ __device__ fptype + fpmax( const fptype& a, const fptype& b ) + { + return max( a, b ); + } + + inline __host__ __device__ fptype + fpmin( const fptype& a, const fptype& b ) + { + return min( a, b ); + } + */ + + inline __host__ __device__ const fptype& + fpmax( const fptype& a, const fptype& b ) + { + return ( ( b < a ) ? 
a : b ); + } + + inline __host__ __device__ const fptype& + fpmin( const fptype& a, const fptype& b ) + { + return ( ( a < b ) ? a : b ); + } + + inline __host__ __device__ fptype + fpsqrt( const fptype& f ) + { #if defined MGONGPU_FPTYPE_FLOAT - // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html - return sqrtf( f ); + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html + return sqrtf( f ); #else - // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html - return sqrt( f ); + // See https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__DOUBLE.html + return sqrt( f ); #endif -} + } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL -//========================================================================== + //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL -//------------------------------ -// Floating point types - C++ -//------------------------------ + //------------------------------ + // Floating point types - C++ + //------------------------------ -inline const fptype& -fpmax( const fptype& a, const fptype& b ) -{ - return std::max( a, b ); -} + inline const fptype& + fpmax( const fptype& a, const fptype& b ) + { + return std::max( a, b ); + } -inline const fptype& -fpmin( const fptype& a, const fptype& b ) -{ - return std::min( a, b ); -} + inline const fptype& + fpmin( const fptype& a, const fptype& b ) + { + return std::min( a, b ); + } -inline fptype -fpsqrt( const fptype& f ) -{ - return std::sqrt( f ); -} + inline fptype + fpsqrt( const fptype& f ) + { + return std::sqrt( f ); + } + +#endif // #ifndef MGONGPUCPP_GPUIMPL -#endif // #ifndef __CUDACC__ + //========================================================================== -//========================================================================== +} // end namespace mg5amcGpu/mg5amcCpu #endif // MGONGPUFPTYPES_H diff 
--git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h index 0dd4c69bd4..cdae04326b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuVectors.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MGONGPUVECTORS_H #define MGONGPUVECTORS_H 1 @@ -26,7 +31,12 @@ //#undef MGONGPU_HAS_CPPCXTYPEV_BRK // gcc test (very slightly slower? issue #172) #endif -namespace mgOnGpu /* clang-format off */ +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif { #ifdef MGONGPU_CPPSIMD @@ -67,22 +77,40 @@ namespace mgOnGpu /* clang-format off */ public: // Array initialization: zero-out as "{0}" (C and C++) or as "{}" (C++ only) // See https://en.cppreference.com/w/c/language/array_initialization#Notes - cxtype_v() : m_real{ 0 }, m_imag{ 0 } {} // RRRR=0000 IIII=0000 + cxtype_v() + : m_real{ 0 }, m_imag{ 0 } {} // RRRR=0000 IIII=0000 cxtype_v( const cxtype_v& ) = default; cxtype_v( cxtype_v&& ) = default; - cxtype_v( const fptype_v& r, const fptype_v& i ) : m_real( r ), m_imag( i ) {} - cxtype_v( const fptype_v& r ) : m_real( r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v( const fptype_v& r, const fptype_v& i ) + : m_real( r ), m_imag( i ) {} + cxtype_v( const fptype_v& r ) + : m_real( r ), m_imag{ 0 } {} // IIII=0000 + cxtype_v( const fptype& r ) + : m_real( fptype_v{} + r ), m_imag{ 0 } {} // IIII=0000 cxtype_v& operator=( const cxtype_v& ) = default; cxtype_v& operator=( cxtype_v&& ) = default; - cxtype_v& operator+=( const cxtype_v& c ) { m_real += 
c.real(); m_imag += c.imag(); return *this; } - cxtype_v& operator-=( const cxtype_v& c ) { m_real -= c.real(); m_imag -= c.imag(); return *this; } + cxtype_v& operator+=( const cxtype_v& c ) + { + m_real += c.real(); + m_imag += c.imag(); + return *this; + } + cxtype_v& operator-=( const cxtype_v& c ) + { + m_real -= c.real(); + m_imag -= c.imag(); + return *this; + } #ifdef MGONGPU_HAS_CPPCXTYPEV_BRK // NB: THIS IS THE FUNDAMENTAL DIFFERENCE BETWEEN MGONGPU_HAS_CPPCXTYPEV_BRK DEFINED AND NOT DEFINED // NB: the alternative "clang" implementation is simpler: it simply does not have any bracket operator[] // NB: ** do NOT implement operator[] to return a value: it does not fail the build (why?) and gives unexpected results! ** cxtype_ref operator[]( size_t i ) const { return cxtype_ref( m_real[i], m_imag[i] ); } #endif - const fptype_v& real() const { return m_real; } + const fptype_v& real() const + { + return m_real; + } const fptype_v& imag() const { return m_imag; } private: fptype_v m_real, m_imag; // RRRRIIII @@ -93,7 +121,7 @@ namespace mgOnGpu /* clang-format off */ #if defined MGONGPU_FPTYPE_DOUBLE typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #elif defined MGONGPU_FPTYPE_FLOAT - typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb + typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb #endif #else // gcc #if defined MGONGPU_FPTYPE_DOUBLE @@ -103,537 +131,570 @@ namespace mgOnGpu /* clang-format off */ #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; #endif // #ifdef MGONGPU_CPPSIMD - -} /* clang-format on */ +} //-------------------------------------------------------------------------- -// Expose typedefs outside the namespace -using mgOnGpu::neppV; -#ifdef MGONGPU_CPPSIMD -using mgOnGpu::fptype_v; -using mgOnGpu::fptype2_v; -using 
mgOnGpu::cxtype_v; -using mgOnGpu::bool_v; -#endif +// DANGEROUS! this was mixing different cxtype definitions for CPU and GPU builds (see #318 and #725) +// DO NOT expose typedefs outside the namespace +//using mgOnGpu::neppV; +//#ifdef MGONGPU_CPPSIMD +//using mgOnGpu::fptype_v; +//using mgOnGpu::fptype2_v; +//using mgOnGpu::cxtype_v; +//using mgOnGpu::bool_v; +//#endif -//-------------------------------------------------------------------------- +//========================================================================== -#ifndef __CUDACC__ +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ +#ifndef MGONGPUCPP_GPUIMPL -// Printout to stream for user defined types + // Printout to stream for user defined types #ifndef MGONGPU_CPPCXTYPE_CXSMPL // operator<< for cxsmpl has already been defined! -inline std::ostream& -operator<<( std::ostream& out, const cxtype& c ) -{ - out << "[" << cxreal( c ) << "," << cximag( c ) << "]"; - //out << cxreal(c) << "+i" << cximag(c); - return out; -} + inline std::ostream& + operator<<( std::ostream& out, const cxtype& c ) + { + out << "[" << cxreal( c ) << "," << cximag( c ) << "]"; + //out << cxreal(c) << "+i" << cximag(c); + return out; + } #endif -/* + /* #ifdef MGONGPU_CPPSIMD -inline std::ostream& -operator<<( std::ostream& out, const bool_v& v ) -{ - out << "{ " << v[0]; - for ( int i=1; i 0 ) outi = fpsqrt( (fptype)v[i] ); + out[i] = outi; + } + return out; + } + + inline fptype_v + fpsqrt( const fptype_v& v ) + { + // See https://stackoverflow.com/questions/18921049/gcc-vector-extensions-sqrt + fptype_v out = {}; // avoid warning 'out' may be used uninitialized: see #594 + for( int i = 0; i < neppV; i++ ) out[i] = fpsqrt( v[i] ); + return out; + } #endif -/* + /* #ifdef MGONGPU_CPPSIMD -inline fptype_v -fpvmake( const fptype v[neppV] ) -{ - fptype_v 
out = {}; // see #594 - for ( int i=0; i #include #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { - using mgOnGpu::np4; - using mgOnGpu::npari; - using mgOnGpu::nparf; - using mgOnGpu::npar; + constexpr int np4 = CPPProcess::np4; // dimensions of 4-momenta (E,px,py,pz) + constexpr int npari = CPPProcess::npari; // #particles in the initial state (incoming): e.g. 2 (e+ e-) for e+ e- -> mu+ mu- + constexpr int nparf = CPPProcess::nparf; // #particles in the final state (outgoing): e.g. 2 (mu+ mu-) for e+ e- -> mu+ mu- + constexpr int npar = CPPProcess::npar; // #particles in total (external = initial + final): e.g. 4 for e+ e- -> mu+ mu- //-------------------------------------------------------------------------- @@ -72,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -155,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc index 2934e3a476..f8e46f2e66 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.cc @@ -1,8 +1,21 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. 
Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +//========================================================================== + #include "read_slha.h" #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -51,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h index feb8b43b5a..c6f7dd8adc 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/read_slha.h @@ -1,3 +1,12 @@ +// Copyright (C) 2010 The MadGraph5_aMC@NLO development team and contributors. +// Created by: J. Alwall (Sep 2010) for the MG5aMC CPP backend. +//========================================================================== +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. 
Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +//========================================================================== + #ifndef READ_SLHA_H #define READ_SLHA_H 1 From 55bfb545f9b0af748ccfc3170c8302d07624f459 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:01:13 +0100 Subject: [PATCH 09/96] [susy2] add many missing files to the newly generated susy_gg_tt.sa Previously untracked files, now added with git add: susy_gg_tt.sa/.gitignore susy_gg_tt.sa/COPYING susy_gg_tt.sa/COPYING.LESSER susy_gg_tt.sa/COPYRIGHT susy_gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc susy_gg_tt.sa/SubProcesses/CommonRandomNumbers.h susy_gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc susy_gg_tt.sa/SubProcesses/GpuAbstraction.h susy_gg_tt.sa/SubProcesses/GpuRuntime.h susy_gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumberKernel.cc susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumbers.h susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CurandRandomNumberKernel.cc susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuAbstraction.h susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/HiprandRandomNumberKernel.cc susy_gg_tt.sa/mg5.in susy_gg_tt.sa/test/ --- epochX/cudacpp/susy_gg_tt.sa/.gitignore | 6 + .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 15 +- epochX/cudacpp/susy_gg_tt.sa/COPYING | 674 ++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/COPYING.LESSER | 165 +++++ epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT | 57 ++ .../SubProcesses/CommonRandomNumberKernel.cc | 38 + .../SubProcesses/CommonRandomNumbers.h | 96 +++ .../SubProcesses/CurandRandomNumberKernel.cc | 135 ++++ .../SubProcesses/GpuAbstraction.h | 69 ++ .../susy_gg_tt.sa/SubProcesses/GpuRuntime.h | 85 +++ .../SubProcesses/HiprandRandomNumberKernel.cc | 145 ++++ 
.../P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore | 12 + .../CommonRandomNumberKernel.cc | 1 + .../CommonRandomNumbers.h | 1 + .../CurandRandomNumberKernel.cc | 1 + .../GpuAbstraction.h | 1 + .../P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h | 1 + .../HiprandRandomNumberKernel.cc | 1 + epochX/cudacpp/susy_gg_tt.sa/mg5.in | 5 + .../susy_gg_tt.sa/test/cudacpp_test.mk | 40 ++ epochX/cudacpp/susy_gg_tt.sa/test/makefile | 1 + epochX/cudacpp/susy_gg_tt.sa/test/ref/.keepme | 0 22 files changed, 1540 insertions(+), 9 deletions(-) create mode 100644 epochX/cudacpp/susy_gg_tt.sa/.gitignore create mode 100644 epochX/cudacpp/susy_gg_tt.sa/COPYING create mode 100644 epochX/cudacpp/susy_gg_tt.sa/COPYING.LESSER create mode 100644 epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumbers.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumberKernel.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumbers.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CurandRandomNumberKernel.cc create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h create mode 120000 
epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/HiprandRandomNumberKernel.cc create mode 100644 epochX/cudacpp/susy_gg_tt.sa/mg5.in create mode 100644 epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk create mode 120000 epochX/cudacpp/susy_gg_tt.sa/test/makefile create mode 100644 epochX/cudacpp/susy_gg_tt.sa/test/ref/.keepme diff --git a/epochX/cudacpp/susy_gg_tt.sa/.gitignore b/epochX/cudacpp/susy_gg_tt.sa/.gitignore new file mode 100644 index 0000000000..803024e1c8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/.gitignore @@ -0,0 +1,6 @@ +crossx.html +index.html +results.dat* +results.pkl +run_[0-9]* +events.lhe* diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 118f88ad3e..1d7052d0ab 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -59,9 +59,6 @@ set stdout_level DEBUG set output information to level: 10 set zerowidth_tchannel F import model MSSM_SLHA2 -INFO: load particles -INFO: load vertices -DEBUG: model prefixing takes 0.9187817573547363  INFO: Restrict model MSSM_SLHA2 with file models/MSSM_SLHA2/restrict_default.dat . INFO: Detect SLHA2 format. keeping restricted parameter in the param_card DEBUG: Simplifying conditional expressions  @@ -557,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.110 s +1 processes with 3 diagrams generated in 0.117 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -585,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.140 s +ALOHA: aloha creates 2 routines in 0.139 s VVV1 FFV1 FFV1 @@ -600,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m2.624s -user 0m2.440s -sys 0m0.084s -Code generation completed in 3 seconds +real 0m1.273s +user 0m1.199s +sys 0m0.065s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/COPYING b/epochX/cudacpp/susy_gg_tt.sa/COPYING new file mode 100644 index 0000000000..f288702d2f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. 
By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. 
+ + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. 
+ + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/epochX/cudacpp/susy_gg_tt.sa/COPYING.LESSER b/epochX/cudacpp/susy_gg_tt.sa/COPYING.LESSER new file mode 100644 index 0000000000..0a041280bd --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/COPYING.LESSER @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT new file mode 100644 index 0000000000..9036d9260a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/COPYRIGHT @@ -0,0 +1,57 @@ +Copyright (C) 2020-2023 CERN and UCLouvain. +Licensed under the GNU Lesser General Public License (version 3 or later). +All rights not expressly granted are reserved. + +The copyright and license notice above cover the CUDACPP code-generating plugin +of the MadGraph5_aMC@NLO (in the following "MG5aMC") software, and all code +generated using that plugin. These are collectively referred to as "this work" +or "the MG5aMC CUDACPP plugin and the code that it generates", or more simply +as "the MG5aMC CUDACPP plugin", in the following and throughout this work. + +The MG5aMC CUDACPP plugin and the code that it generates are based on the +initial work on porting MG5aMC to GPUs using CUDA and on speeding up MG5aMC on +CPUs using vectorized C++ by three original authors from CERN and UCLouvain. +The full development team currently includes the following authors : + Stephan Hageboeck (CERN) + Olivier Mattelaer (Universite Catholique de Louvain, original author) + Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) + Andrea Valassi (CERN, original author) + Zenny Wettersten (CERN) +See https://github.com/madgraph5/madgraph4gpu for more details. For the full +list of authors and collaborators of this work, see the file "AUTHORS" in the +same directory as this "COPYRIGHT" file in the source code of the plugin. 
+ +The MG5aMC CUDACPP plugin and the code that it generates are derived from, and +are intended to be used in combination with, the MG5aMC software and the code +that it generates. The MG5aMC software is developed by the MadGraph5_aMC@NLO +development team and contributors, also known as the "MadTeam", who are the +owners of its copyright and have licensed it as specified in +https://github.com/mg5amcnlo/mg5amcnlo/blob/main/madgraph/LICENSE. +For the full list of authors and contributors of the MG5aMC software, see +https://github.com/mg5amcnlo/mg5amcnlo/blob/main/madgraph/AUTHORS. + +The MG5aMC CUDACPP plugin and the code that it generates are free software; +you can redistribute them and/or modify them under the terms of the GNU Lesser +General Public License as published by the Free Software Foundation, either +version 3 or (at your option) any later version. + +This work is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A +PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. + +The GNU Lesser General Public License (LGPL) version 3 is copied verbatim in +the file "COPYING.LESSER" in the same directory as this "COPYRIGHT" file. It is +also available at . + +This version of the GNU Lesser General Public License incorporates the terms +and conditions of version 3 of the GNU General Public License (GPL), which is +copied verbatim in the file "COPYING" in the same directory as this "COPYRIGHT" +file and is also available at . + +In line with the license above, the authors emphasise the following points. For +the developers' and authors' protection, the GPL clearly explains that there is +no warranty for this free software. For both users' and authors' sake, the GPL +requires that modified versions be marked as changed, so that their problems +will not be attributed erroneously to authors of previous versions. 
+ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc new file mode 100644 index 0000000000..010bc4cbd0 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -0,0 +1,38 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. + +#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- + + CommonRandomNumberKernel::CommonRandomNumberKernel( BufferRndNumMomenta& rnarray ) + : RandomNumberKernelBase( rnarray ) + , m_seed( 20211220 ) + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CommonRandomNumberKernel on host with a device random number array" ); + } + + //-------------------------------------------------------------------------- + + void CommonRandomNumberKernel::generateRnarray() + { + std::vector rnd = CommonRandomNumbers::generate( m_rnarray.size(), m_seed ); // NB: generate as double (HARDCODED) + std::copy( rnd.begin(), rnd.end(), m_rnarray.data() ); // NB: copy may imply a double-to-float conversion + } + + //-------------------------------------------------------------------------- +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumbers.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumbers.h new file mode 100644 index 0000000000..410b332c48 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CommonRandomNumbers.h @@ -0,0 +1,96 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. 
+// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2023) for the MG5aMC CUDACPP plugin. + +#ifndef COMMONRANDOMNUMBERS_H_ +#define COMMONRANDOMNUMBERS_H_ 1 + +#include +#include +#include +#include + +namespace CommonRandomNumbers +{ + + /// Create `n` random numbers using simple c++ engine. + template + std::vector generate( std::size_t n, std::minstd_rand::result_type seed = 1337 ) + { + std::vector result; + result.reserve( n ); + + std::minstd_rand generator( seed ); + std::uniform_real_distribution distribution( 0.0, 1.0 ); + + for( std::size_t i = 0; i < n; ++i ) + { + result.push_back( distribution( generator ) ); + } + + return result; + } + + /// Create `nBlock` blocks of random numbers. + /// Each block uses a generator that's seeded with `seed + blockIndex`, and blocks are generated in parallel. + template + std::vector> generateParallel( std::size_t nPerBlock, std::size_t nBlock, std::minstd_rand::result_type seed = 1337 ) + { + std::vector> results( nBlock ); + std::vector threads; + const auto partPerThread = nBlock / std::thread::hardware_concurrency() + ( nBlock % std::thread::hardware_concurrency() != 0 ); + + auto makeBlock = [nPerBlock, nBlock, seed, &results]( std::size_t partitionBegin, std::size_t partitionEnd ) + { + for( std::size_t partition = partitionBegin; partition < partitionEnd && partition < nBlock; ++partition ) + { + results[partition] = generate( nPerBlock, seed + partition ); + } + }; + + for( unsigned int threadId = 0; threadId < std::thread::hardware_concurrency(); ++threadId ) + { + threads.emplace_back( makeBlock, threadId * partPerThread, ( threadId + 1 ) * partPerThread ); + } + + for( auto& thread: threads ) + { + thread.join(); + } + + return results; + } + + /// Starts asynchronous generation of random numbers. This uses as many threads as cores, and generates blocks of random numbers. 
+ /// These become available at unspecified times, but the blocks 0, 1, 2, ... are generated first. + /// Each block is seeded with seed + blockIndex to generate stable sequences. + /// \param[in/out] promises Vector of promise objects storing blocks of random numbers. + /// \param[in] nPerBlock Configures number of entries generated per block. + /// \param[in] nBlock Configures the number of blocks generated. + /// \param[in] nThread Optional concurrency. + /// \param[in] seed Optional seed. + template + void startGenerateAsync( std::vector>>& promises, std::size_t nPerBlock, std::size_t nBlock, unsigned int nThread = std::thread::hardware_concurrency(), std::minstd_rand::result_type seed = 1337 ) + { + promises.resize( nBlock ); + std::vector threads; + + auto makeBlocks = [=, &promises]( std::size_t threadID ) + { + for( std::size_t partition = threadID; partition < nBlock; partition += nThread ) + { + auto values = generate( nPerBlock, seed + partition ); + promises[partition].set_value( std::move( values ) ); + } + }; + + for( unsigned int threadId = 0; threadId < nThread; ++threadId ) + { + std::thread( makeBlocks, threadId ).detach(); + } + } + +} + +#endif /* COMMONRANDOMNUMBERS_H_ */ diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc new file mode 100644 index 0000000000..c160c5e06b --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -0,0 +1,135 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+ +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_CURAND /* clang-format off */ +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +#include "curand.h" +#define checkCurand( code ){ assertCurand( code, __FILE__, __LINE__ ); } +inline void assertCurand( curandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != CURAND_STATUS_SUCCESS ) + { + printf( "CurandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == CURAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_CURAND + CurandRandomNumberKernel::CurandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "CurandRandomNumberKernel does not support CurandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "CurandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + CurandRandomNumberKernel::~CurandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + //printf( 
"seedGenerator: seed %d\n", seed ); + checkCurand( curandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::createGenerator() + { + // [NB Timings are for GenRnGen host|device (cpp|cuda) generation of 256*32*1 events with nproc=1: rn(0) is host=0.0012s] + const curandRngType_t type = CURAND_RNG_PSEUDO_MTGP32; // 0.00082s | 0.00064s (FOR FAST TESTS) + //const curandRngType_t type = CURAND_RNG_PSEUDO_XORWOW; // 0.049s | 0.0016s + //const curandRngType_t type = CURAND_RNG_PSEUDO_MRG32K3A; // 0.71s | 0.0012s (better but slower, especially in c++) + //const curandRngType_t type = CURAND_RNG_PSEUDO_MT19937; // 21s | 0.021s + //const curandRngType_t type = CURAND_RNG_PSEUDO_PHILOX4_32_10; // 0.024s | 0.00026s (used to segfault?) + if( m_isOnDevice ) + { + checkCurand( curandCreateGenerator( &m_rnGen, type ) ); + } + else + { + checkCurand( curandCreateGeneratorHost( &m_rnGen, type ) ); + } + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_LEGACY ) ); // fails with code=104 (see #429) + checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_BEST ) ); + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_DYNAMIC ) ); // fails with code=104 (see #429) + //checkCurand( curandSetGeneratorOrdering( *&m_rnGen, CURAND_ORDERING_PSEUDO_SEEDED ) ); // fails with code=104 (see #429) + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::destroyGenerator() + { + checkCurand( curandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void CurandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkCurand( 
curandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! 
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc new file mode 100644 index 0000000000..2e4534f9d4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/HiprandRandomNumberKernel.cc @@ -0,0 +1,145 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Jan 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#include "mgOnGpuConfig.h" + +#include "GpuRuntime.h" +#include "MemoryBuffers.h" +#include "RandomNumberKernels.h" + +#include + +#ifndef MGONGPU_HAS_NO_HIPRAND /* clang-format off */ +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ 1 // enable hiprand for AMD (rocrand) +#endif +#include +#define checkHiprand( code ){ assertHiprand( code, __FILE__, __LINE__ ); } +inline void assertHiprand( hiprandStatus_t code, const char *file, int line, bool abort = true ) +{ + if ( code != HIPRAND_STATUS_SUCCESS ) + { + printf( "HiprandAssert: %s:%d code=%d\n", file, line, code ); + if ( abort ) assert( code == HIPRAND_STATUS_SUCCESS ); + } +} +#endif /* clang-format on */ + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + //-------------------------------------------------------------------------- +#ifndef MGONGPU_HAS_NO_HIPRAND + HiprandRandomNumberKernel::HiprandRandomNumberKernel( BufferRndNumMomenta& rnarray, const bool onDevice ) + : RandomNumberKernelBase( rnarray ) + , m_isOnDevice( onDevice ) + { + if( m_isOnDevice ) + { +#ifdef MGONGPUCPP_GPUIMPL + if( !m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on device with a host random number array" ); +#else + throw std::runtime_error( "HiprandRandomNumberKernel does not support HiprandDevice on CPU host" ); +#endif + } + else + { + if( m_rnarray.isOnDevice() ) + throw std::runtime_error( "HiprandRandomNumberKernel on host with a device random number array" ); + } + createGenerator(); + } + + //-------------------------------------------------------------------------- + + HiprandRandomNumberKernel::~HiprandRandomNumberKernel() + { + destroyGenerator(); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::seedGenerator( const unsigned int seed ) + { + if( m_isOnDevice ) + { + destroyGenerator(); // workaround for #429 + createGenerator(); // workaround for #429 + } + 
//printf( "seedGenerator: seed %d\n", seed ); + checkHiprand( hiprandSetPseudoRandomGeneratorSeed( m_rnGen, seed ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::createGenerator() + { + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_DEFAULT; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_XORWOW; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MRG32K3A; + const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MTGP32; // same as curand; not implemented yet (code=1000) in host code + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_MT19937; + //const hiprandRngType_t type = HIPRAND_RNG_PSEUDO_PHILOX4_32_10; + if( m_isOnDevice ) + { + checkHiprand( hiprandCreateGenerator( &m_rnGen, type ) ); + } + else + { + // See https://github.com/ROCm/hipRAND/issues/76 + throw std::runtime_error( "HiprandRandomNumberKernel on host is not supported yet (hiprandCreateGeneratorHost is not implemented yet)" ); + //checkHiprand( hiprandCreateGeneratorHost( &m_rnGen, type ) ); // ALWAYS FAILS WITH CODE=1000 + } + // FIXME: hiprand ordering is not implemented yet + // See https://github.com/ROCm/hipRAND/issues/75 + /* + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_LEGACY ) ); + checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_BEST ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_DYNAMIC ) ); + //checkHiprand( hiprandSetGeneratorOrdering( *&m_rnGen, HIPRAND_ORDERING_PSEUDO_SEEDED ) ); + */ + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::destroyGenerator() + { + checkHiprand( hiprandDestroyGenerator( m_rnGen ) ); + } + + //-------------------------------------------------------------------------- + + void HiprandRandomNumberKernel::generateRnarray() + { +#if defined MGONGPU_FPTYPE_DOUBLE + checkHiprand( hiprandGenerateUniformDouble( 
m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkHiprand( hiprandGenerateUniform( m_rnGen, m_rnarray.data(), m_rnarray.size() ) ); +#endif + /* + printf( "\nHiprandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); + fptype* data = m_rnarray.data(); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) + { + data = new fptype[m_rnarray.size()](); + checkCuda( cudaMemcpy( data, m_rnarray.data(), m_rnarray.bytes(), cudaMemcpyDeviceToHost ) ); + } +#endif + for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) + printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); +#ifdef MGONGPUCPP_GPUIMPL + if( m_rnarray.isOnDevice() ) delete[] data; +#endif + */ + } + + //-------------------------------------------------------------------------- +#endif +} diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore new file mode 100644 index 0000000000..7fc2433954 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/.gitignore @@ -0,0 +1,12 @@ +.libs +.cudacpplibs +madevent +madevent_fortran +madevent_cpp +madevent_cuda + +G[0-9]* +ajob[0-9]* +input_app.txt +symfact.dat +gensym diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumberKernel.cc new file mode 120000 index 0000000000..c7ce22d0a1 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumberKernel.cc @@ -0,0 +1 @@ +../CommonRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumbers.h 
b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumbers.h new file mode 120000 index 0000000000..50b45ccea8 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CommonRandomNumbers.h @@ -0,0 +1 @@ +../CommonRandomNumbers.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CurandRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CurandRandomNumberKernel.cc new file mode 120000 index 0000000000..b8b4406ed2 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CurandRandomNumberKernel.cc @@ -0,0 +1 @@ +../CurandRandomNumberKernel.cc \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/HiprandRandomNumberKernel.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/HiprandRandomNumberKernel.cc new file mode 120000 index 0000000000..6691864f78 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/HiprandRandomNumberKernel.cc @@ -0,0 +1 @@ +../HiprandRandomNumberKernel.cc \ No 
newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/mg5.in b/epochX/cudacpp/susy_gg_tt.sa/mg5.in new file mode 100644 index 0000000000..61f549c86a --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/mg5.in @@ -0,0 +1,5 @@ +set stdout_level DEBUG +set zerowidth_tchannel F +import model MSSM_SLHA2 +generate g g > t t~ +output standalone_cudacpp susy_gg_tt.sa diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk new file mode 100644 index 0000000000..39ed957600 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/test/cudacpp_test.mk @@ -0,0 +1,40 @@ +# Copyright (C) 2020-2023 CERN and UCLouvain. +# Licensed under the GNU Lesser General Public License (version 3 or later). +# Created by: S. Hageboeck (Dec 2020) for the CUDACPP plugin. +# Modified by: A. Valassi (2020-2023) for the CUDACPP plugin. + +THISDIR = $(dir $(abspath $(lastword $(MAKEFILE_LIST)))) + +# Compiler-specific googletest build directory (#125 and #738) +# In epochX, CXXNAMESUFFIX=_$(CXXNAME) is exported from cudacpp.mk +# In epoch1/epoch2, CXXNAMESUFFIX is undefined +$(info CXXNAMESUFFIX=$(CXXNAMESUFFIX)) +BUILDDIR = build$(CXXNAMESUFFIX) +###$(info BUILDDIR=$(BUILDDIR)) +INSTALLDIR = install$(CXXNAMESUFFIX) +###$(info INSTALLDIR=$(INSTALLDIR)) + +CXXFLAGS += -Igoogletest/googletest/include/ -std=c++11 + +all: googletest/$(INSTALLDIR)/lib64/libgtest.a + +googletest/CMakeLists.txt: + git clone https://github.com/google/googletest.git -b release-1.11.0 googletest + +googletest/$(BUILDDIR)/Makefile: googletest/CMakeLists.txt + mkdir -p googletest/$(BUILDDIR) + cd googletest/$(BUILDDIR) && cmake -DCMAKE_INSTALL_PREFIX:PATH=$(THISDIR)/googletest/install -DBUILD_GMOCK=OFF ../ + +googletest/$(BUILDDIR)/lib/libgtest.a: googletest/$(BUILDDIR)/Makefile + $(MAKE) -C googletest/$(BUILDDIR) + +# NB 'make install' is no longer supported in googletest (issue 328) +# NB keep 'lib64' instead of 'lib' as in LCG cvmfs installations 
+googletest/$(INSTALLDIR)/lib64/libgtest.a: googletest/$(BUILDDIR)/lib/libgtest.a + mkdir -p googletest/$(INSTALLDIR)/lib64 + cp googletest/$(BUILDDIR)/lib/lib*.a googletest/$(INSTALLDIR)/lib64/ + mkdir -p googletest/$(INSTALLDIR)/include + cp -r googletest/googletest/include/gtest googletest/$(INSTALLDIR)/include/ + +clean: + rm -rf googletest diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/makefile b/epochX/cudacpp/susy_gg_tt.sa/test/makefile new file mode 120000 index 0000000000..e9c5fd8b46 --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/test/makefile @@ -0,0 +1 @@ +cudacpp_test.mk \ No newline at end of file diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/ref/.keepme b/epochX/cudacpp/susy_gg_tt.sa/test/ref/.keepme new file mode 100644 index 0000000000..e69de29bb2 From bc66488487c80311aabc5359e8ab8a6684cddebc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:06:17 +0100 Subject: [PATCH 10/96] [susy2] in susyggtt.sa add constexpr to cxsmmpl::conj function (now valid code? In the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") Cherry-pick commit 0fd793b9630667768a303b32112f0c237920e6e5 from Mon Apr 3 13:55:33 2023 +0200, no conflicts Change the date with 'git commit --amend --date="$(date -R)"' The above comments come from the original commit. Updates on 14-Feb-2024 below. 'make HRDCOD=1' fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h:758:5: error: expected unqualified-id before 'if' 758 | if( mdl_Mneu2 < 0 ) | ^~ --- epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); From 098fc731692fa42219c9d4f28a1a036576f50da3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:07:01 +0100 Subject: [PATCH 11/96] [susy2] in susyggtt.sa Parameters.h, fix constexpr fixes for Majorana particles in HRDCOD=1 Cherry-pick commit bd5db8fcee5d9c7940fab36962de40651d37ebc3 from Mon Apr 3 14:06:20 2023 +0200, fix conflicts with HIP support PR Change the date with 'git commit --amend --date="$(date -R)"' The above comments come from the original commit. Updates on 14-Feb-2024 below. 'make HRDCOD=1' fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h: In function 'const mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::DependentCouplings_sv mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG(const mg5amcCpu::fptype_sv&)': Parameters_MSSM_SLHA2.h:839:58: error: conversion from 'mg5amcCpu::fptype_sv' {aka '__vector(4) double'} to non-scalar type 'const mgOnGpu::cxsmpl' requested 839 | constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); | ~~~~~~~~^~~~~~~~~ Parameters_MSSM_SLHA2.h:839:58: error: could not convert '(((mg5amcCpu::fptype_sv)G) * ((mg5amcCpu::fptype_sv)G))' from 'mg5amcCpu::fptype_sv' {aka '__vector(4) double'} to 'const mgOnGpu::cxsmpl' 839 | constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); | ~~~~~~~~^~~~~~~~~ | | | mg5amcCpu::fptype_sv {aka __vector(4) double} --- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 8f3d4c4241..0e4a55d314 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -148,11 +148,11 @@ namespace mg5amcCpu constexpr double mdl_Wsu4 = 1.152973e+00; constexpr double mdl_Wsd4 = 2.858123e-01; constexpr double mdl_Wch2 = 2.486895e+00; - constexpr double mdl_Wneu4 = 2.585851e+00; - constexpr double mdl_Wneu3 = 1.915985e+00; + constexpr double mdl_Wneu4_abs = 2.585851e+00; + constexpr double mdl_Wneu3_abs = 1.915985e+00; constexpr double mdl_Wch1 = 1.704145e-02; - constexpr double mdl_Wneu2 = 2.077700e-02; - constexpr double mdl_Wgo = 5.506754e+00; + constexpr 
double mdl_Wneu2_abs = 2.077700e-02; + constexpr double mdl_Wgo_abs = 5.506754e+00; constexpr double mdl_Wsn3 = 1.475190e-01; constexpr double mdl_Wsl3 = 1.483273e-01; constexpr double mdl_Wsn2 = 1.498816e-01; @@ -755,14 +755,16 @@ namespace mg5amcCpu constexpr double mdl_vd = mdl_vev * mdl_cos__beta; constexpr double mdl_vu = mdl_vev * mdl_sin__beta; constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); - if( mdl_Mneu2 < 0 ) - mdl_Wneu2 = -abs( mdl_Wneu2 ); - if( mdl_Mneu3 < 0 ) - mdl_Wneu3 = -abs( mdl_Wneu3 ); - if( mdl_Mneu4 < 0 ) - mdl_Wneu4 = -abs( mdl_Wneu4 ); - if( mdl_Mgo < 0 ) - mdl_Wgo = -abs( mdl_Wgo ); + // Fixes for Majorana particles + constexpr int mdl_Wneu2_sign = ( mdl_Mneu2 < 0 ? -1 : + 1 ); + constexpr int mdl_Wneu3_sign = ( mdl_Mneu3 < 0 ? -1 : + 1 ); + constexpr int mdl_Wneu4_sign = ( mdl_Mneu4 < 0 ? -1 : + 1 ); + constexpr int mdl_Wgo_sign = ( mdl_Mgo < 0 ? -1 : + 1 ); + constexpr double mdl_Wneu2 = mdl_Wneu2_sign * mdl_Wneu2_abs; + constexpr double mdl_Wneu3 = mdl_Wneu3_sign * mdl_Wneu3_abs; + constexpr double mdl_Wneu4 = mdl_Wneu4_sign * mdl_Wneu4_abs; + constexpr double mdl_Wgo = mdl_Wgo_sign * mdl_Wgo_abs; + // Model couplings independent of aS // (none) From 415e27bcefd62f31bde199618311ee0d17342a12 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:07:17 +0100 Subject: [PATCH 12/96] [susy2] in susyggtt.sa Parameters.h, fix mdl_G__exp__2 as in SM ggtt.sa (why is this different here??) Replace constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); by const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); Cherry-pick commit 36dfe0591c31b27488c5f140042e88771b259e81 from Mon Apr 3 14:09:36 2023 +0200, fix a conflict with HIP support PR Change the date with 'git commit --amend --date="$(date -R)"' The above comments come from the original commit. Updates on 14-Feb-2024 below. 'make HRDCOD=1' fails with ccache /usr/local/cuda-12.0/bin/nvcc -Xcompiler -O3 -lineinfo -I. 
-I../../src -I/usr/local/cuda-12.0/include/ -DUSE_NVTX -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70 -use_fast_math -std=c++17 -ccbin /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -Xcompiler -fPIC -c -x cu Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2_cu.o Parameters_MSSM_SLHA2.h(333): error: expression must have a constant value Parameters_MSSM_SLHA2.h(333): note #2703-D: cannot call non-constexpr function "atan(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(4137): here Parameters_MSSM_SLHA2.h(748): error: expression must have a constant value Parameters_MSSM_SLHA2.h(748): note #2703-D: cannot call non-constexpr function "cos(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(553): here Parameters_MSSM_SLHA2.h(749): error: expression must have a constant value Parameters_MSSM_SLHA2.h(749): note #2703-D: cannot call non-constexpr function "sin(double) noexcept(true)" /usr/local/cuda-12.0/include/crt/math_functions.h(520): here 3 errors detected in the compilation of "Parameters_MSSM_SLHA2.cc". Note however that 'CUDA_HOME=none make HRDCOD=1 -j' does succeed! This means that constexpr sin, cos, atan exist in gcc but not in nvcc? Maybe they can be simply hardcoded for nvcc as they are for gcc? Also, check.exe succeeds, while runTest.exe fails because the reference logfile is missing. 
Note also: cannot cherry-pick commit 586195dced571514e6f5f903c2e4b4b7fb7fe9af from Mon Apr 3 14:13:36 2023 +0200 This would result in an empty commit: I imagine that the upstream mg5anlo has modified this part of the generated code --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 0e4a55d314..80be641274 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -836,7 +836,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS out.GC_6 = -G; out.GC_51 = -( cI * G * mdl_I51x11 ); From d5ded87a291fe7b6779d8f7cb9e1d0b6c30ec0af Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:08:47 +0100 Subject: [PATCH 13/96] [susy2] add reference log file in susy_gg_tt.sa CUDA_HOME=none make HRDCOD=1 -j CUDACPP_RUNTEST_DUMPEVENTS=1 ./runTest.exe --- .../dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt | 3584 +++++++++++++++++ 1 file changed, 3584 insertions(+) create mode 100644 epochX/cudacpp/susy_gg_tt.sa/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt diff --git a/epochX/cudacpp/susy_gg_tt.sa/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt b/epochX/cudacpp/susy_gg_tt.sa/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt new file mode 100644 index 0000000000..fe68e0d3cb --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt @@ -0,0 +1,3584 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.849331413473452e+02 -3.138365726669761e+02 -3.490842674916366e+02 + 3 7.500000000000001e+02 -5.849331413473452e+02 3.138365726669761e+02 3.490842674916364e+02 + ME 2.005277975590333e+00 + +Event 1 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.810632950825978e+01 -7.201507372976420e+02 -2.038840274050557e+02 + 3 7.499999999999995e+02 -4.810632950825982e+01 7.201507372976420e+02 2.038840274050556e+02 + ME 1.987965040619574e+00 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -6.648646621266247e+02 -9.844173672211535e+01 -3.328125681616957e+02 + 3 7.500000000000001e+02 6.648646621266247e+02 9.844173672211555e+01 3.328125681616955e+02 + ME 1.996913845057977e+00 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.840443703260547e+02 2.880181894591821e+02 -6.315570585677355e+02 + 3 7.500000000000003e+02 -2.840443703260547e+02 -2.880181894591822e+02 6.315570585677355e+02 + ME 3.954303259991462e+00 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.068110730975250e+02 -7.417834499166065e+02 2.913259503670260e+01 + 3 7.500000000000009e+02 -1.068110730975250e+02 7.417834499166063e+02 -2.913259503670238e+01 + ME 2.010443642097316e+00 + +Event 5 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.488894260747408e+02 3.183002578419756e+01 2.552404693662126e+01 + 3 7.500000000000002e+02 -7.488894260747409e+02 -3.183002578419794e+01 -2.552404693662112e+01 + ME 2.010593307151333e+00 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.342046371107455e+02 6.997714234797991e+02 2.341133705259679e+02 + 3 7.500000000000008e+02 -1.342046371107465e+02 -6.997714234797999e+02 -2.341133705259673e+02 + ME 1.984627685022313e+00 + +Event 7 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 7.437674050373496e+02 -9.311360962031367e+01 -2.529630224920391e+01 + 3 7.499999999999999e+02 -7.437674050373496e+02 9.311360962031355e+01 2.529630224920380e+01 + ME 2.010602101284343e+00 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.195549862822020e+02 4.141972083174200e+02 3.478552699778412e+02 + 3 7.500000000000000e+02 -5.195549862822019e+02 -4.141972083174200e+02 -3.478552699778412e+02 + ME 2.004552700163305e+00 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.542195584407184e+02 -3.922814044239862e+02 -3.185215546629776e+02 + 3 7.500000000000001e+02 5.542195584407184e+02 3.922814044239862e+02 
3.185215546629777e+02 + ME 1.991552846979707e+00 + +Event 10 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.763918745729663e+01 4.519082493535349e+02 -5.983040976226634e+02 + 3 7.500000000000000e+02 1.763918745729668e+01 -4.519082493535349e+02 5.983040976226634e+02 + ME 3.238239815179103e+00 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.439620635476698e+02 -5.926551785630562e+02 3.895412056149584e+02 + 3 7.500000000000001e+02 -2.439620635476698e+02 5.926551785630559e+02 -3.895412056149584e+02 + ME 2.039685380250985e+00 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.246322345811117e+02 -1.588382746342976e+02 -6.977182538090112e+02 + 3 7.499999999999999e+02 -2.246322345811119e+02 1.588382746342976e+02 6.977182538090112e+02 + ME 7.660154481284926e+00 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 1.323144312359150e+02 1.592517083468643e+02 -7.208548984888159e+02 + 3 7.499999999999998e+02 -1.323144312359151e+02 -1.592517083468644e+02 7.208548984888159e+02 + ME 1.137910388792190e+01 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -7.006663333078419e+02 
-2.674479229498733e+02 -6.188527994805768e+00 + 3 7.500000000000000e+02 7.006663333078420e+02 2.674479229498733e+02 6.188527994805669e+00 + ME 2.011060943142014e+00 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.958624135651978e+02 -4.159237215561344e+02 3.789827498187641e+02 + 3 7.499999999999994e+02 4.958624135651975e+02 4.159237215561346e+02 -3.789827498187632e+02 + ME 2.028504154582342e+00 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.469644380816109e+02 -1.127460289287336e+02 6.991401142001760e+02 + 3 7.499999999999998e+02 2.469644380816109e+02 1.127460289287336e+02 -6.991401142001760e+02 + ME 7.819200457562640e+00 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.330123448568297e+02 -1.270424905118157e+02 -9.514782126800218e+01 + 3 7.500000000000003e+02 7.330123448568297e+02 1.270424905118154e+02 9.514782126800226e+01 + ME 2.004547006017963e+00 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.632088213454307e+02 -7.202207510227294e+02 -1.309387277748307e+02 + 3 7.499999999999999e+02 1.632088213454307e+02 7.202207510227292e+02 1.309387277748307e+02 + ME 1.999378789386389e+00 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -6.437046519440436e+02 3.615612227659706e+02 1.319765254034735e+02 + 3 7.499999999999990e+02 6.437046519440436e+02 -3.615612227659711e+02 -1.319765254034727e+02 + ME 1.999216047420012e+00 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.414032790372056e+02 6.619737876650544e+01 6.645010971451344e+02 + 3 7.500000000000000e+02 -3.414032790372056e+02 -6.619737876650579e+01 -6.645010971451345e+02 + ME 5.185737809672471e+00 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.873372127826304e+02 1.330214527069503e+02 -2.690220233436987e+02 + 3 7.500000000000001e+02 6.873372127826304e+02 -1.330214527069503e+02 2.690220233436988e+02 + ME 1.983618346512617e+00 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.102099953078523e+02 -1.032640414901363e+02 4.236511529094595e+02 + 3 7.500000000000001e+02 -6.102099953078524e+02 1.032640414901364e+02 -4.236511529094595e+02 + ME 2.089683691123227e+00 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.344895803220363e+02 4.175443017228515e+02 3.201212967391419e+02 + 3 7.499999999999998e+02 5.344895803220362e+02 -4.175443017228516e+02 -3.201212967391422e+02 + ME 1.992070352051261e+00 + +Event 24 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.418425161536912e+02 -5.777728680811457e+02 -4.566828369015798e+02 + 3 7.499999999999984e+02 -1.418425161536913e+02 5.777728680811457e+02 4.566828369015796e+02 + ME 2.164935885748348e+00 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.716447327703473e+02 -2.655667876457525e+02 -5.948566834991209e+02 + 3 7.500000000000003e+02 3.716447327703468e+02 2.655667876457529e+02 5.948566834991209e+02 + ME 3.182266218087864e+00 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.982684047175275e+02 2.520499043566510e+02 1.067336904966802e+02 + 3 7.499999999999982e+02 -6.982684047175262e+02 -2.520499043566514e+02 -1.067336904966819e+02 + ME 2.002985852932905e+00 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.841422120170265e+02 -2.875466445556266e+02 3.722832300072826e+02 + 3 7.500000000000002e+02 5.841422120170264e+02 2.875466445556268e+02 -3.722832300072830e+02 + ME 2.022282544333713e+00 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.475651347914464e+02 9.856189468224678e-02 -5.125157689733829e+02 + 3 7.499999999999998e+02 5.475651347914463e+02 
-9.856189468233521e-02 5.125157689733828e+02 + ME 2.387846999289178e+00 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.650224466071159e+02 8.783714288524544e+01 -5.818408377565889e+02 + 3 7.500000000000002e+02 4.650224466071161e+02 -8.783714288524529e+01 5.818408377565888e+02 + ME 2.993624880577912e+00 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.782526290173227e+02 -4.370648563926671e+02 -5.828381521098888e+02 + 3 7.499999999999999e+02 -1.782526290173226e+02 4.370648563926671e+02 5.828381521098888e+02 + ME 3.006916608849429e+00 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.241445908528016e+01 -6.984630617313881e+02 -2.681456243827463e+02 + 3 7.500000000000001e+02 -5.241445908528017e+01 6.984630617313881e+02 2.681456243827463e+02 + ME 1.983591630570634e+00 + +Event 32 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.502669185499846e+02 -4.340701284735799e+02 -4.139357747603254e+02 + 3 7.500000000000022e+02 -4.502669185499836e+02 4.340701284735788e+02 4.139357747603258e+02 + ME 2.073003851963782e+00 + +Event 33 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 
3.654307823639496e+02 -1.558862065836156e+02 -6.361287871947329e+02 + 3 7.499999999999986e+02 -3.654307823639502e+02 1.558862065836151e+02 6.361287871947329e+02 + ME 4.085426045280015e+00 + +Event 34 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.482796593496792e+02 5.744589614092289e+02 4.588137359318844e+02 + 3 7.499999999999999e+02 1.482796593496794e+02 -5.744589614092289e+02 -4.588137359318844e+02 + ME 2.170948319867109e+00 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -5.948173619449914e+02 -4.238736069615458e+02 1.703627636844680e+02 + 3 7.499999999999997e+02 5.948173619449915e+02 4.238736069615456e+02 -1.703627636844681e+02 + ME 1.993014678181581e+00 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.826165317041466e+02 4.475925182298392e+02 -5.313086048919125e+02 + 3 7.500000000000006e+02 -2.826165317041466e+02 -4.475925182298394e+02 5.313086048919129e+02 + ME 2.503977667659708e+00 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.291270169087662e+02 -1.025561575994625e+02 5.215427446678407e+02 + 3 7.499999999999999e+02 -5.291270169087661e+02 1.025561575994623e+02 -5.215427446678407e+02 + ME 2.440305740927076e+00 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.783993127041845e+02 -6.682289316640183e+02 2.900754731338715e+02 + 3 7.499999999999999e+02 1.783993127041844e+02 6.682289316640183e+02 -2.900754731338715e+02 + ME 1.985304274456458e+00 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 7.035220480385169e+02 -2.585746933707227e+02 2.637908019484471e+01 + 3 7.499999999999966e+02 -7.035220480385183e+02 2.585746933707188e+02 -2.637908019484133e+01 + ME 2.010559599140099e+00 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.045990218130194e+00 -7.391865839862932e+02 -1.268940501329283e+02 + 3 7.500000000000000e+02 -1.045990218130196e+00 7.391865839862933e+02 1.268940501329283e+02 + ME 2.000007807605100e+00 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 4.516393832142919e+02 -5.982487717061383e+02 2.490531433069624e+01 + 3 7.499999999999998e+02 -4.516393832142917e+02 5.982487717061382e+02 -2.490531433069631e+01 + ME 2.010617017961215e+00 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.404063782711548e+02 -2.677733538887419e+02 -6.123078707476250e+02 + 3 7.499999999999999e+02 3.404063782711548e+02 2.677733538887418e+02 6.123078707476250e+02 + ME 3.496555157380759e+00 + +Event 43 Batch 0 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.431989925840304e+02 -4.375135005185252e+02 4.179193580543111e+02 + 3 7.500000000000002e+02 -4.431989925840304e+02 4.375135005185252e+02 -4.179193580543111e+02 + ME 2.079583697162946e+00 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.639601113951325e+02 7.019963091374882e+02 -5.122650643294619e+00 + 3 7.499999999999998e+02 -2.639601113951325e+02 -7.019963091374882e+02 5.122650643294828e+00 + ME 2.011070173201013e+00 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.618223353367828e+02 2.920485830812342e+02 5.137389957014393e+02 + 3 7.499999999999999e+02 4.618223353367827e+02 -2.920485830812343e+02 -5.137389957014392e+02 + ME 2.394622186242569e+00 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000191e+02 -7.051112468437821e+02 2.555740963349632e+02 1.041966542134262e-01 + 3 7.499999999999935e+02 7.051112468437943e+02 -2.555740963349714e+02 -1.041966542091747e-01 + ME 2.011090259550378e+00 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.814786796723018e+02 -1.264994422665163e+02 -6.332234266526609e+02 + 3 7.500000000000001e+02 
3.814786796723018e+02 1.264994422665164e+02 6.332234266526608e+02 + ME 4.000948864693926e+00 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.223718392138900e+02 -9.075826002414982e+01 5.304626281216113e+02 + 3 7.500000000000000e+02 5.223718392138899e+02 9.075826002414982e+01 -5.304626281216113e+02 + ME 2.498154652615908e+00 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999975e+02 2.512062929704643e+02 5.497568647487620e+02 -4.440301656798204e+02 + 3 7.500000000000001e+02 -2.512062929704639e+02 -5.497568647487620e+02 4.440301656798205e+02 + ME 2.132341642526155e+00 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.081599294091975e+02 -4.985692845883419e+02 3.838412959671587e+02 + 3 7.500000000000002e+02 4.081599294091976e+02 4.985692845883420e+02 -3.838412959671588e+02 + ME 2.033431466872660e+00 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999981e+02 6.013537967828100e+02 1.484120711712781e+02 4.229036157631559e+02 + 3 7.499999999999998e+02 -6.013537967828098e+02 -1.484120711712782e+02 -4.229036157631558e+02 + ME 2.088322753728032e+00 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
3.788389248314356e+01 -6.412729991041272e+02 3.870836489834254e+02 + 3 7.500000000000001e+02 -3.788389248314355e+01 6.412729991041272e+02 -3.870836489834253e+02 + ME 2.036924470326421e+00 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.721956772389075e+02 -5.973666994060347e+02 2.590818486220348e+02 + 3 7.499999999999999e+02 3.721956772389076e+02 5.973666994060347e+02 -2.590818486220348e+02 + ME 1.983491962850991e+00 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.279511131417045e+02 4.100901816942154e+02 1.855365759053465e+00 + 3 7.500000000000002e+02 -6.279511131417045e+02 -4.100901816942154e+02 -1.855365759053460e+00 + ME 2.011087631467305e+00 + +Event 55 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 2.906696961973025e+02 5.783283149374524e+02 -3.788766129680948e+02 + 3 7.500000000000013e+02 -2.906696961973020e+02 -5.783283149374529e+02 3.788766129680953e+02 + ME 2.028400504671404e+00 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.052921399041344e+02 -6.285366974145302e+02 -3.539728199972655e+02 + 3 7.500000000000000e+02 -2.052921399041344e+02 6.285366974145302e+02 3.539728199972655e+02 + ME 2.008326264666069e+00 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.566747271383028e+02 -2.172482942051992e+01 -5.021367144037026e+02 + 3 7.500000000000002e+02 -5.566747271383028e+02 2.172482942051981e+01 5.021367144037026e+02 + ME 2.334236035577041e+00 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -1.111752699385218e+02 -4.460916785360812e+02 -5.925725893888097e+02 + 3 7.500000000000000e+02 1.111752699385215e+02 4.460916785360812e+02 5.925725893888097e+02 + ME 3.146652063342941e+00 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 8.715681773052760e+00 -1.696629445817669e+02 -7.305056619404637e+02 + 3 7.499999999999999e+02 -8.715681773052756e+00 1.696629445817669e+02 7.305056619404637e+02 + ME 1.416956033396763e+01 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 6.928032817602112e+02 2.624225493926031e+02 1.168675247986685e+02 + 3 7.500000000000007e+02 -6.928032817602116e+02 -2.624225493926037e+02 -1.168675247986700e+02 + ME 2.001525295764226e+00 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -3.549971539677352e+02 2.839011090540922e+02 -5.965544241330084e+02 + 3 7.500000000000007e+02 3.549971539677362e+02 -2.839011090540924e+02 5.965544241330070e+02 + ME 3.209489797994468e+00 + +Event 62 Batch 0 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.075091268532094e+02 5.438463972512831e+02 4.149328055225047e+02 + 3 7.500000000000003e+02 3.075091268532095e+02 -5.438463972512831e+02 -4.149328055225046e+02 + ME 2.074617910784855e+00 + +Event 63 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.476352128383519e+02 -1.524963074254458e+02 4.892244371258423e+02 + 3 7.500000000000001e+02 -5.476352128383519e+02 1.524963074254458e+02 -4.892244371258422e+02 + ME 2.276300836068042e+00 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.967977363399760e+02 1.264105709968621e+02 -6.237563017523095e+02 + 3 7.500000000000000e+02 -3.967977363399760e+02 -1.264105709968620e+02 6.237563017523095e+02 + ME 3.751976516771654e+00 + +Event 65 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.816326009293766e+02 -1.978646582155085e+02 2.423356646874607e+02 + 3 7.499999999999998e+02 -6.816326009293764e+02 1.978646582155085e+02 -2.423356646874606e+02 + ME 1.984054085169622e+00 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.665640119169115e+01 -7.284851156281566e+02 -1.745443913733847e+02 + 3 7.500000000000002e+02 
3.665640119169112e+01 7.284851156281566e+02 1.745443913733847e+02 + ME 1.992344720708242e+00 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.153305678246726e+02 1.717790906820774e+02 -1.459250586433670e+02 + 3 7.499999999999997e+02 -7.153305678246727e+02 -1.717790906820775e+02 1.459250586433670e+02 + ME 1.996986995028635e+00 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.487016905468892e+02 8.269239097845504e+01 -5.952552069277764e+02 + 3 7.500000000000000e+02 4.487016905468893e+02 -8.269239097845502e+01 5.952552069277764e+02 + ME 3.188598120955346e+00 + +Event 69 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000244e+02 4.919500575481476e+02 -2.881927178399743e+02 -4.872679942930075e+02 + 3 7.499999999999948e+02 -4.919500575481655e+02 2.881927178399841e+02 4.872679942930299e+02 + ME 2.268286827319417e+00 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.773996828232634e+02 1.950566034399255e+02 -6.180796072185036e+02 + 3 7.500000000000000e+02 3.773996828232636e+02 -1.950566034399255e+02 6.180796072185035e+02 + ME 3.619701258963251e+00 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 
5.046979384687710e+02 -3.405008111295478e+02 4.379945074147525e+02 + 3 7.499999999999985e+02 -5.046979384687706e+02 3.405008111295465e+02 -4.379945074147532e+02 + ME 2.118527835260358e+00 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.079545016377724e+02 4.542001484952560e+02 5.594257326540076e+02 + 3 7.500000000000000e+02 2.079545016377724e+02 -4.542001484952560e+02 -5.594257326540076e+02 + ME 2.737044891305738e+00 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.692745262677147e+02 2.933889353680814e+02 1.688032614990298e+02 + 3 7.500000000000000e+02 6.692745262677147e+02 -2.933889353680814e+02 -1.688032614990298e+02 + ME 1.993266062385568e+00 + +Event 74 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.336394009948918e+02 -5.214090606974566e+00 -6.119065967645178e+02 + 3 7.500000000000001e+02 -4.336394009948917e+02 5.214090606974610e+00 6.119065967645178e+02 + ME 3.488386859223064e+00 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.886320510875878e+01 -1.836844277890389e+02 -7.247724843508738e+02 + 3 7.500000000000001e+02 5.886320510875916e+01 1.836844277890389e+02 7.247724843508737e+02 + ME 1.237452317062543e+01 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.688146793241962e+02 4.830467602600368e+02 3.307243925875947e+02 + 3 7.499999999999999e+02 -4.688146793241962e+02 -4.830467602600368e+02 -3.307243925875947e+02 + ME 1.996022578666890e+00 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.339445789702437e+02 -2.102547003631318e+02 -3.411850403658548e+02 + 3 7.499999999999999e+02 6.339445789702437e+02 2.102547003631319e+02 3.411850403658547e+02 + ME 2.000891094991024e+00 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.669825029292291e+02 -6.850785338298774e+02 1.479450763464257e+02 + 3 7.500000000000002e+02 -2.669825029292290e+02 6.850785338298771e+02 -1.479450763464256e+02 + ME 1.996659436127080e+00 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.533967556727067e+02 -1.417706364842367e+02 -6.461515454681355e+02 + 3 7.499999999999999e+02 3.533967556727067e+02 1.417706364842366e+02 6.461515454681355e+02 + ME 4.411164889613540e+00 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.262373668943077e+02 -6.163135238556597e+02 -3.129452628857666e+01 + 3 7.500000000000001e+02 4.262373668943077e+02 6.163135238556597e+02 3.129452628857667e+01 + ME 2.010344710722510e+00 + +Event 81 Batch 0 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.465617073786625e+02 6.074723773897961e+01 -3.814957644509608e+01 + 3 7.499999999999998e+02 7.465617073786623e+02 -6.074723773897957e+01 3.814957644509607e+01 + ME 2.009985571987386e+00 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 7.475659550619685e+02 -4.211883263705301e+00 6.022792435124191e+01 + 3 7.499999999999999e+02 -7.475659550619689e+02 4.211883263705446e+00 -6.022792435124160e+01 + ME 2.008374203929284e+00 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.388305538878825e+02 2.707043584000302e+02 -2.848063794273738e+02 + 3 7.500000000000001e+02 -6.388305538878825e+02 -2.707043584000302e+02 2.848063794273738e+02 + ME 1.984680018426294e+00 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.782135480622982e+02 -6.945359840166777e+02 -2.199083859088391e+02 + 3 7.499999999999998e+02 1.782135480622988e+02 6.945359840166777e+02 2.199083859088391e+02 + ME 1.985988745513523e+00 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.627525077696844e+02 1.216078070177454e+02 -6.450753129512397e+02 + 3 7.499999999999999e+02 
3.627525077696847e+02 -1.216078070177455e+02 6.450753129512397e+02 + ME 4.373397884360151e+00 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.904296033356203e+02 8.239172209485156e+01 4.550873406668076e+02 + 3 7.500000000000001e+02 5.904296033356203e+02 -8.239172209485157e+01 -4.550873406668076e+02 + ME 2.160536921334943e+00 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.743433600636463e+01 -2.654341936409747e+02 6.982100680837729e+02 + 3 7.499999999999999e+02 -6.743433600636475e+01 2.654341936409747e+02 -6.982100680837726e+02 + ME 7.714450680030922e+00 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.123842692071390e+02 6.372819831147648e+02 -2.424618368923954e+02 + 3 7.500000000000001e+02 3.123842692071391e+02 -6.372819831147648e+02 2.424618368923954e+02 + ME 1.984046675304770e+00 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000013e+02 2.445149472352194e+02 2.427088730721040e+02 -6.661867932574363e+02 + 3 7.499999999999998e+02 -2.445149472352193e+02 -2.427088730721042e+02 6.661867932574366e+02 + ME 5.271748569268313e+00 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000008e+02 -6.884814144916876e+01 7.440438304666905e+02 -6.448807458320981e+01 + 3 7.500000000000000e+02 6.884814144916906e+01 -7.440438304666907e+02 6.448807458320999e+01 + ME 2.007986879860101e+00 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 4.141760802881196e+02 -5.972800904748399e+02 1.849720736747397e+02 + 3 7.499999999999998e+02 -4.141760802881197e+02 5.972800904748400e+02 -1.849720736747392e+02 + ME 1.990710544351959e+00 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 3.924404784338562e+01 6.957303155217187e+02 2.773431678857735e+02 + 3 7.500000000000002e+02 -3.924404784338608e+01 -6.957303155217187e+02 -2.773431678857725e+02 + ME 1.984035246923061e+00 + +Event 93 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.290083413418521e+02 -2.957602135933498e+02 5.394235248061855e+02 + 3 7.500000000000003e+02 4.290083413418521e+02 2.957602135933496e+02 -5.394235248061854e+02 + ME 2.563041219586656e+00 + +Event 94 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.430046970424431e+02 5.127268066666062e+02 6.897188351284382e+01 + 3 7.500000000000002e+02 5.430046970424431e+02 -5.127268066666062e+02 -6.897188351284385e+01 + ME 2.007553878294709e+00 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.670174710353043e+02 -4.266096910567015e+02 -5.938091746221820e+02 + 3 7.500000000000003e+02 -1.670174710353046e+02 4.266096910567016e+02 5.938091746221818e+02 + ME 3.165791387662161e+00 + +Event 96 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.630781801178598e+02 4.564206587749571e+02 -5.338258642940684e+02 + 3 7.499999999999999e+02 2.630781801178598e+02 -4.564206587749570e+02 5.338258642940684e+02 + ME 2.521668423905222e+00 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 3.518870680096824e+02 -5.345219674188427e+02 -3.911032571000131e+02 + 3 7.500000000000005e+02 -3.518870680096824e+02 5.345219674188430e+02 3.911032571000131e+02 + ME 2.041492391644695e+00 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.752074620001102e+02 -5.701512527722717e+02 -1.077284410990111e+02 + 3 7.500000000000003e+02 -4.752074620001106e+02 5.701512527722713e+02 1.077284410990113e+02 + ME 2.002846157195692e+00 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.725046135512981e+02 -2.442507613156968e+02 6.033919774737651e+02 + 3 7.500000000000001e+02 3.725046135512981e+02 2.442507613156967e+02 -6.033919774737650e+02 + ME 3.326050357956303e+00 + +Event 100 Batch 
0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -6.744099165829510e+02 -2.914668115726679e+02 1.507261164040269e+02 + 3 7.500000000000005e+02 6.744099165829509e+02 2.914668115726684e+02 -1.507261164040272e+02 + ME 1.996207314305127e+00 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.607743247254612e+01 -6.958012347235277e+02 2.794676330495263e+02 + 3 7.500000000000001e+02 1.607743247254616e+01 6.958012347235277e+02 -2.794676330495263e+02 + ME 1.984191496570907e+00 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.752700687038076e+02 -3.061796867633290e+02 -1.129793508844210e+02 + 3 7.500000000000001e+02 -6.752700687038074e+02 3.061796867633290e+02 1.129793508844210e+02 + ME 2.002095206274886e+00 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.709405901562356e+02 -3.521101236710536e+02 6.042430449861606e+02 + 3 7.500000000000014e+02 -2.709405901562357e+02 3.521101236710547e+02 -6.042430449861606e+02 + ME 3.341378826640739e+00 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000019e+02 -5.896850660222877e+02 -3.933704689676251e+02 2.450126467235561e+02 + 3 7.499999999999995e+02 
5.896850660222864e+02 3.933704689676245e+02 -2.450126467235561e+02 + ME 1.983906451475833e+00 + +Event 105 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.232008513548739e+02 2.751892248186205e+02 -6.183139172194097e+02 + 3 7.499999999999999e+02 3.232008513548740e+02 -2.751892248186206e+02 6.183139172194099e+02 + ME 3.624933423198033e+00 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000022e+02 -6.293500647395228e+02 3.467904123852736e+02 -2.148369285993608e+02 + 3 7.499999999999977e+02 6.293500647395228e+02 -3.467904123852738e+02 2.148369285993607e+02 + ME 1.986570256908784e+00 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -3.164803773315051e+02 6.485956071491387e+02 -2.041173906137060e+02 + 3 7.499999999999999e+02 3.164803773315048e+02 -6.485956071491385e+02 2.041173906137067e+02 + ME 1.987933572364906e+00 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.270898770881453e+02 1.543847712987803e+02 -7.228509566520220e+02 + 3 7.499999999999999e+02 -1.270898770881454e+02 -1.543847712987803e+02 7.228509566520220e+02 + ME 1.186640788044667e+01 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999999e+02 5.270964825883011e+02 4.495607724123360e+02 2.873402337840359e+02 + 3 7.499999999999999e+02 -5.270964825883012e+02 -4.495607724123360e+02 -2.873402337840358e+02 + ME 1.984961998572323e+00 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.153664733993946e+02 -2.099256997662553e+02 -8.174355824015288e+01 + 3 7.500000000000001e+02 7.153664733993946e+02 2.099256997662553e+02 8.174355824015289e+01 + ME 2.006184439353365e+00 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 -4.856864011705250e+02 4.693412062195835e+02 -3.260790576875004e+02 + 3 7.499999999999993e+02 4.856864011705258e+02 -4.693412062195836e+02 3.260790576875023e+02 + ME 1.994176128857618e+00 + +Event 112 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.064700140846772e+02 5.272494748911355e+02 1.673203994140850e+02 + 3 7.500000000000005e+02 5.064700140846775e+02 -5.272494748911357e+02 -1.673203994140850e+02 + ME 1.993505729951682e+00 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.589223488778573e+02 3.321278479133653e+02 3.738942371383856e+02 + 3 7.500000000000001e+02 -5.589223488778573e+02 -3.321278479133653e+02 -3.738942371383856e+02 + ME 2.023720122294617e+00 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.977003556727846e+01 -1.428011991695803e+02 7.345956446068556e+02 + 3 7.500000000000001e+02 4.977003556727855e+01 1.428011991695803e+02 -7.345956446068556e+02 + ME 1.578342838955713e+01 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.329003126925221e+02 7.085629881653696e+02 7.871426903610248e+01 + 3 7.499999999999999e+02 -2.329003126925221e+02 -7.085629881653693e+02 -7.871426903610259e+01 + ME 2.006526871110616e+00 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.524145310251806e+01 9.339299182317880e+01 7.440063488879913e+02 + 3 7.500000000000000e+02 1.524145310251808e+01 -9.339299182317883e+01 -7.440063488879913e+02 + ME 2.124419017838906e+01 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 8.625602426427481e+01 6.794638862165408e+02 -3.055956897694005e+02 + 3 7.500000000000019e+02 -8.625602426427798e+01 -6.794638862165372e+02 3.055956897693977e+02 + ME 1.988059503765081e+00 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.994800131986639e+02 -5.547627166697749e+02 7.251237555226915e+01 + 3 7.499999999999997e+02 4.994800131986638e+02 5.547627166697752e+02 -7.251237555226918e+01 + ME 
2.007194060980918e+00 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.696453710916101e+02 -6.412256459582578e+02 -3.500715912961300e+02 + 3 7.499999999999997e+02 1.696453710916101e+02 6.412256459582578e+02 3.500715912961301e+02 + ME 2.005872415478047e+00 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.909680497736684e+02 -4.472542030924528e+02 -1.149801807392985e+02 + 3 7.499999999999998e+02 -5.909680497736686e+02 4.472542030924529e+02 1.149801807392985e+02 + ME 2.001803340914013e+00 + +Event 121 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.403248055966552e+02 -1.878335448594353e+02 5.773496557562032e+02 + 3 7.500000000000001e+02 -4.403248055966552e+02 1.878335448594353e+02 -5.773496557562032e+02 + ME 2.935937839381182e+00 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.911542100677308e+01 4.343577103010796e+02 6.062789206800156e+02 + 3 7.499999999999997e+02 7.911542100677336e+01 -4.343577103010785e+02 -6.062789206800155e+02 + ME 3.378827638495415e+00 + +Event 123 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.897831726795230e+02 -1.483823971281492e+02 
5.482655451352789e+02 + 3 7.499999999999997e+02 -4.897831726795229e+02 1.483823971281491e+02 -5.482655451352789e+02 + ME 2.634572486292920e+00 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.072168527361445e+02 3.676168647522196e+02 -6.200153358521670e+02 + 3 7.500000000000000e+02 2.072168527361444e+02 -3.676168647522194e+02 6.200153358521670e+02 + ME 3.663499416923802e+00 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.485400265882382e+02 -5.747265047667001e+02 -4.583942683103900e+02 + 3 7.500000000000002e+02 1.485400265882382e+02 5.747265047667001e+02 4.583942683103899e+02 + ME 2.169752204214021e+00 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.787754338344337e+02 4.314693606084867e+02 5.868334815886135e+02 + 3 7.500000000000003e+02 -1.787754338344338e+02 -4.314693606084865e+02 -5.868334815886134e+02 + ME 3.062012798921335e+00 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.824698245714143e+02 -6.992613640416623e+02 -2.005948849783361e+02 + 3 7.500000000000002e+02 1.824698245714143e+02 6.992613640416619e+02 2.005948849783361e+02 + ME 1.988415763442605e+00 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.808356150498172e+02 5.501654866182587e+02 -3.387951765355025e+02 + 3 7.500000000000001e+02 -3.808356150498172e+02 -5.501654866182587e+02 3.387951765355026e+02 + ME 1.999687906503314e+00 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.909177663532490e+02 -7.106409991592089e+00 4.618069860289238e+02 + 3 7.500000000000000e+02 -5.909177663532489e+02 7.106409991592302e+00 -4.618069860289237e+02 + ME 2.179665903384003e+00 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 -6.434408032869533e+02 1.129979432354512e+02 3.683957077518649e+02 + 3 7.499999999999989e+02 6.434408032869521e+02 -1.129979432354518e+02 -3.683957077518653e+02 + ME 2.018960418239853e+00 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 6.746515082539579e+02 1.884449229635104e+02 2.680183826157030e+02 + 3 7.500000000000000e+02 -6.746515082539579e+02 -1.884449229635102e+02 -2.680183826157021e+02 + ME 1.983588012685104e+00 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.558298868453927e+02 9.921998690776338e+01 -6.980017644638849e+02 + 3 7.499999999999997e+02 2.558298868453928e+02 -9.921998690776336e+01 6.980017644638850e+02 + ME 7.691362840915397e+00 + +Event 133 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 3.135802854217638e+02 -4.216148118485033e+02 5.351713324018350e+02 + 3 7.499999999999995e+02 -3.135802854217635e+02 4.216148118485038e+02 -5.351713324018348e+02 + ME 2.531352293421454e+00 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.181187851375606e+02 2.100869264433576e+02 7.102192872171314e+02 + 3 7.499999999999999e+02 1.181187851375606e+02 -2.100869264433576e+02 -7.102192872171313e+02 + ME 9.317293972715463e+00 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.827401847882261e+02 -7.210896591418211e+02 -9.558100408709542e+01 + 3 7.500000000000001e+02 1.827401847882261e+02 7.210896591418210e+02 9.558100408709542e+01 + ME 2.004490941604272e+00 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 1.347097060534662e+02 3.165403007658666e+02 6.664499479226027e+02 + 3 7.500000000000008e+02 -1.347097060534673e+02 -3.165403007658664e+02 -6.664499479226030e+02 + ME 5.285444697457056e+00 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.709304925774791e+02 -4.143354602318728e+02 -2.546850977966301e+02 + 3 7.500000000000000e+02 -5.709304925774790e+02 
4.143354602318728e+02 2.546850977966302e+02 + ME 1.983552010334113e+00 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.230024938116915e+02 -6.078146322618757e+01 7.134952708195409e+02 + 3 7.500000000000000e+02 2.230024938116915e+02 6.078146322618757e+01 -7.134952708195409e+02 + ME 9.871519249046608e+00 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.938095139105243e+02 -4.125095274968337e+02 3.853414767395157e+02 + 3 7.499999999999999e+02 4.938095139105243e+02 4.125095274968338e+02 -3.853414767395157e+02 + ME 2.035026781271170e+00 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.150639064959597e+02 7.154069121581596e+02 -6.663682284611146e+01 + 3 7.500000000000010e+02 -2.150639064959609e+02 -7.154069121581591e+02 6.663682284611136e+01 + ME 2.007782577771487e+00 + +Event 141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.001994340787286e+02 -3.935615219448572e+02 2.176464283557658e+02 + 3 7.500000000000001e+02 -6.001994340787286e+02 3.935615219448572e+02 -2.176464283557658e+02 + ME 1.986242560158687e+00 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 
3.402076114496817e+02 6.461448067635170e+02 -1.710428887859415e+02 + 3 7.500000000000002e+02 -3.402076114496816e+02 -6.461448067635171e+02 1.710428887859415e+02 + ME 1.992905283508286e+00 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -2.483174680463646e+02 4.871846387911280e+02 -5.133123442788026e+02 + 3 7.499999999999984e+02 2.483174680463649e+02 -4.871846387911279e+02 5.133123442788022e+02 + ME 2.392247598308911e+00 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.342206172984319e+01 7.464718320769290e+02 -6.878379852869921e+01 + 3 7.500000000000003e+02 2.342206172984328e+01 -7.464718320769291e+02 6.878379852869914e+01 + ME 2.007572555109217e+00 + +Event 145 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999972e+02 6.139969952391268e+02 -3.976063262667790e+02 -1.655804914534163e+02 + 3 7.500000000000030e+02 -6.139969952391284e+02 3.976063262667795e+02 1.655804914534169e+02 + ME 1.993787627301450e+00 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -7.489741806317043e+02 -1.872912227594817e+01 3.445136754673255e+01 + 3 7.500000000000001e+02 7.489741806317041e+02 1.872912227594822e+01 -3.445136754673256e+01 + ME 2.010187867275871e+00 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.413828145683407e+02 3.157741422579704e+02 -6.654303801495521e+02 + 3 7.500000000000000e+02 1.413828145683408e+02 -3.157741422579705e+02 6.654303801495525e+02 + ME 5.232788324013259e+00 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.671561067841420e+02 2.284674470436643e+02 2.553533920314686e+02 + 3 7.500000000000001e+02 6.671561067841420e+02 -2.284674470436643e+02 -2.553533920314688e+02 + ME 1.983538597988968e+00 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.495458719419115e+02 -1.546896579009742e+02 -6.452819485673282e+02 + 3 7.499999999999998e+02 -3.495458719419115e+02 1.546896579009742e+02 6.452819485673284e+02 + ME 4.380593272658192e+00 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.603068721757451e+01 7.476532040007083e+02 1.937133545739223e+01 + 3 7.499999999999994e+02 5.603068721757452e+01 -7.476532040007069e+02 -1.937133545739284e+01 + ME 2.010803533079019e+00 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -2.349239539523203e+01 2.915649777816018e+02 -6.906069584718988e+02 + 3 7.499999999999984e+02 2.349239539523064e+01 -2.915649777816013e+02 6.906069584718991e+02 + ME 6.951008749064900e+00 + +Event 152 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 1.325883219266814e+02 7.097564553439875e+02 2.028943345345159e+02 + 3 7.500000000000000e+02 -1.325883219266814e+02 -7.097564553439880e+02 -2.028943345345148e+02 + ME 1.988099263967034e+00 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.538006601590899e+02 -6.984195271964692e+02 -1.014661960136019e+02 + 3 7.500000000000003e+02 2.538006601590900e+02 6.984195271964692e+02 1.014661960136019e+02 + ME 2.003711111593866e+00 + +Event 154 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.323172710781250e+02 -6.860984228486728e+02 2.725088878165297e+02 + 3 7.500000000000000e+02 1.323172710781251e+02 6.860984228486726e+02 -2.725088878165297e+02 + ME 1.983756374901854e+00 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.452704712456864e+02 -4.996070210616754e+02 1.247915770129769e+02 + 3 7.500000000000001e+02 5.452704712456864e+02 4.996070210616755e+02 -1.247915770129769e+02 + ME 2.000331222506823e+00 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -1.756428583566509e+02 5.398397609473058e+02 -4.901251052625874e+02 + 3 7.500000000000000e+02 
1.756428583566503e+02 -5.398397609473059e+02 4.901251052625868e+02 + ME 2.280054547244574e+00 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -7.343271291233364e+02 5.984001257480936e+01 1.402955463602692e+02 + 3 7.500000000000000e+02 7.343271291233360e+02 -5.984001257480933e+01 -1.402955463602694e+02 + ME 1.997894644163825e+00 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 5.956876772329629e+02 -3.688982317556030e+02 -2.675262338545276e+02 + 3 7.499999999999999e+02 -5.956876772329626e+02 3.688982317556029e+02 2.675262338545273e+02 + ME 1.983574637868442e+00 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.610896376803973e+02 -1.089359794187623e+02 5.814028710041215e+02 + 3 7.499999999999999e+02 -4.610896376803973e+02 1.089359794187621e+02 -5.814028710041215e+02 + ME 2.987844404961483e+00 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.080441809468048e+02 2.091039913840823e+02 -1.320945063756017e+02 + 3 7.500000000000002e+02 -7.080441809468050e+02 -2.091039913840823e+02 1.320945063756016e+02 + ME 1.999197513465153e+00 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000000e+02 6.069769124239758e+02 3.104285773922232e+02 3.125909885497298e+02 + 3 7.499999999999997e+02 -6.069769124239758e+02 -3.104285773922232e+02 -3.125909885497298e+02 + ME 1.989802440637427e+00 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.828417389515272e+01 3.828517565470377e+02 6.412969702909132e+02 + 3 7.500000000000001e+02 -6.828417389515273e+01 -3.828517565470376e+02 -6.412969702909131e+02 + ME 4.246334187981941e+00 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.807789436749608e+02 6.954089716815359e+02 -8.339478017354233e+00 + 3 7.499999999999999e+02 -2.807789436749607e+02 -6.954089716815359e+02 8.339478017354306e+00 + ME 2.011037025935879e+00 + +Event 164 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.142343236884305e+02 3.091601587257571e+02 -6.067757296338901e+02 + 3 7.500000000000000e+02 -3.142343236884303e+02 -3.091601587257572e+02 6.067757296338900e+02 + ME 3.388136839521116e+00 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.065756695626629e+02 4.857807362514072e+02 2.643826153403072e+02 + 3 7.499999999999993e+02 5.065756695626631e+02 -4.857807362514072e+02 -2.643826153403074e+02 + ME 1.983511995791894e+00 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.582369138095781e+02 -7.322863531814778e+02 3.489664815125224e+01 + 3 7.500000000000001e+02 -1.582369138095780e+02 7.322863531814778e+02 -3.489664815125219e+01 + ME 2.010164566736145e+00 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 4.506018848986033e+02 -1.448154652770772e+02 -5.817958596813918e+02 + 3 7.499999999999998e+02 -4.506018848986031e+02 1.448154652770771e+02 5.817958596813919e+02 + ME 2.993029659647449e+00 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.383838657580175e+02 4.266457451013773e+02 -6.011017500263644e+02 + 3 7.500000000000000e+02 -1.383838657580178e+02 -4.266457451013773e+02 6.011017500263642e+02 + ME 3.285727681812074e+00 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.787059935025895e+02 5.010202352445186e+02 -2.869133940063202e+02 + 3 7.499999999999999e+02 -4.787059935025894e+02 -5.010202352445185e+02 2.869133940063202e+02 + ME 1.984912165231123e+00 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.735853015668399e+02 3.083708463739302e+01 6.976389875699663e+02 + 3 7.499999999999999e+02 2.735853015668399e+02 -3.083708463739308e+01 -6.976389875699662e+02 + ME 
7.651473042207752e+00 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.071409269801590e+01 6.637658831421408e+02 -3.438439345464608e+02 + 3 7.499999999999998e+02 -6.071409269801586e+01 -6.637658831421406e+02 3.438439345464608e+02 + ME 2.002296135796218e+00 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.459627102375574e+02 -5.960011581968069e+01 4.987445367439287e+01 + 3 7.500000000000014e+02 7.459627102375568e+02 5.960011581968120e+01 -4.987445367439179e+01 + ME 2.009214312595575e+00 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.227626126251469e+02 -8.611217040357144e+01 5.308520932893802e+02 + 3 7.500000000000003e+02 5.227626126251467e+02 8.611217040357188e+01 -5.308520932893800e+02 + ME 2.500827870542702e+00 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 7.427899788159027e+02 8.606171690349179e+01 -5.793468955896293e+01 + 3 7.500000000000008e+02 -7.427899788159040e+02 -8.606171690349203e+01 5.793468955896337e+01 + ME 2.008572783324111e+00 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 1.880595469134288e+02 1.952277576903689e+02 6.992994562002085e+02 
+ 3 7.500000000000005e+02 -1.880595469134287e+02 -1.952277576903685e+02 -6.992994562002081e+02 + ME 7.837424879675873e+00 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.956039433245556e+02 6.710800379532824e+02 -1.573845333938319e+02 + 3 7.500000000000003e+02 2.956039433245558e+02 -6.710800379532825e+02 1.573845333938319e+02 + ME 1.995121848065215e+00 + +Event 177 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.644810965455426e+02 -6.588907599528918e+02 2.416872276699125e+02 + 3 7.500000000000003e+02 -2.644810965455426e+02 6.588907599528917e+02 -2.416872276699127e+02 + ME 1.984092858745212e+00 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.046483864607852e+02 3.100401775516634e+02 3.174514404662581e+02 + 3 7.500000000000002e+02 6.046483864607852e+02 -3.100401775516634e+02 -3.174514404662581e+02 + ME 1.991217628237894e+00 + +Event 179 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999952e+02 -7.554400969691652e+01 -5.342383817568527e+02 -5.209438108440612e+02 + 3 7.499999999999992e+02 7.554400969691555e+01 5.342383817568511e+02 5.209438108440604e+02 + ME 2.436643331539693e+00 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 7.500000000000001e+02 5.109280079817108e+02 7.886792704189078e+01 5.433529430710355e+02 + 3 7.500000000000002e+02 -5.109280079817111e+02 -7.886792704189065e+01 -5.433529430710350e+02 + ME 2.593854265630499e+00 + +Event 181 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -2.142616447442584e+02 -7.154366257261994e+02 -6.886495597177299e+01 + 3 7.500000000000006e+02 2.142616447442587e+02 7.154366257261991e+02 6.886495597177280e+01 + ME 2.007564501650115e+00 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.851592186609105e+02 1.986891791084640e+02 2.314723637697936e+02 + 3 7.500000000000001e+02 -6.851592186609104e+02 -1.986891791084641e+02 -2.314723637697936e+02 + ME 1.984847553137794e+00 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000302e+02 -4.900054502464825e+02 -4.951952606757475e+02 -2.778062499891797e+02 + 3 7.500000000000072e+02 4.900054502465236e+02 4.951952606757305e+02 2.778062499891464e+02 + ME 1.984067510574099e+00 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -4.678382693716270e+02 5.778877958512080e+02 -9.835166047197205e+01 + 3 7.500000000000005e+02 4.678382693716272e+02 -5.778877958512080e+02 9.835166047197220e+01 + ME 2.004127965245699e+00 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.061006737744816e+02 -6.192100761952033e+02 4.096602599263589e+02 + 3 7.499999999999989e+02 1.061006737744816e+02 6.192100761952032e+02 -4.096602599263590e+02 + ME 2.066323292737944e+00 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.455169979042824e+02 -6.058703167657474e+02 3.676174138996159e+02 + 3 7.500000000000002e+02 -2.455169979042824e+02 6.058703167657475e+02 -3.676174138996160e+02 + ME 2.018319718790237e+00 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 -7.262021511245958e+01 -7.131602101967057e+02 -2.205194298677789e+02 + 3 7.500000000000001e+02 7.262021511245966e+01 7.131602101967057e+02 2.205194298677793e+02 + ME 1.985921774689299e+00 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000017e+02 -7.363140077196471e+02 -6.013110168472285e+01 1.293287773313052e+02 + 3 7.499999999999994e+02 7.363140077196483e+02 6.013110168472299e+01 -1.293287773313051e+02 + ME 1.999630194484571e+00 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 4.206549846649757e+02 6.188260461572106e+02 -5.102654675699417e+01 + 3 7.499999999999998e+02 -4.206549846649756e+02 -6.188260461572107e+02 
5.102654675699394e+01 + ME 2.009128074801350e+00 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.604864850049361e+00 4.279484687014978e+01 -7.487570945048690e+02 + 3 7.500000000000005e+02 5.604864850049498e+00 -4.279484687014975e+01 7.487570945048689e+02 + ME 2.560979109157362e+01 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 3.197160362814988e+01 2.941727699376931e+02 -6.891590512999376e+02 + 3 7.500000000000002e+02 -3.197160362814988e+01 -2.941727699376931e+02 6.891590512999373e+02 + ME 6.822140982496753e+00 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.806169100853666e+02 -1.212419534166893e+02 -4.590036954694657e+02 + 3 7.499999999999997e+02 -5.806169100853666e+02 1.212419534166893e+02 4.590036954694659e+02 + ME 2.171492035497775e+00 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -6.456318764132128e+02 1.898225735585407e+02 3.310994876570574e+02 + 3 7.499999999999995e+02 6.456318764132126e+02 -1.898225735585411e+02 -3.310994876570574e+02 + ME 1.996179823211170e+00 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 6.338176939226099e+02 
-1.425870083131823e+02 -3.747586902673792e+02 + 3 7.500000000000002e+02 -6.338176939226099e+02 1.425870083131823e+02 3.747586902673791e+02 + ME 2.024506557702449e+00 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.241223574217288e+01 6.578529508306934e+02 -3.576740098786882e+02 + 3 7.500000000000006e+02 -4.241223574217268e+01 -6.578529508306935e+02 3.576740098786880e+02 + ME 2.010814173675303e+00 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.866306133186302e+02 -4.882418460209895e+01 6.913458544798877e+02 + 3 7.499999999999999e+02 -2.866306133186302e+02 4.882418460209892e+01 -6.913458544798877e+02 + ME 7.018637302241416e+00 + +Event 197 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.280026621922506e+02 -1.673514311633879e+02 -5.927117508906940e+02 + 3 7.500000000000000e+02 4.280026621922506e+02 1.673514311633879e+02 5.927117508906940e+02 + ME 3.148789429367749e+00 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.361491583779542e+02 5.918825836234865e+02 4.400436499668660e+02 + 3 7.500000000000000e+02 1.361491583779541e+02 -5.918825836234865e+02 -4.400436499668658e+02 + ME 2.123099646243896e+00 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.634973495764100e+02 2.221316746677182e+01 -3.489668211967611e+02 + 3 7.499999999999998e+02 6.634973495764100e+02 -2.221316746677164e+01 3.489668211967611e+02 + ME 2.005207966091981e+00 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -8.393568923823409e+01 7.316416839490794e+02 1.419691740493622e+02 + 3 7.500000000000000e+02 8.393568923823405e+01 -7.316416839490794e+02 -1.419691740493622e+02 + ME 1.997625733174078e+00 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.430360425977639e+02 -7.001150406129030e+02 -1.152059542992963e+02 + 3 7.499999999999997e+02 -2.430360425977634e+02 7.001150406129030e+02 1.152059542992962e+02 + ME 2.001770217517976e+00 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.592948659363004e+02 -1.261773212911235e+02 3.345288677256897e+02 + 3 7.499999999999999e+02 6.592948659363005e+02 1.261773212911236e+02 -3.345288677256896e+02 + ME 1.997675744803344e+00 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 6.830488054426544e+02 -6.443348473770897e+01 -3.029730275584196e+02 + 3 7.499999999999976e+02 -6.830488054426536e+02 6.443348473771104e+01 3.029730275584206e+02 + ME 1.987490455239470e+00 + +Event 204 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.581562204420950e+02 4.859410156234014e+02 5.489516729825758e+02 + 3 7.500000000000001e+02 -1.581562204420950e+02 -4.859410156234014e+02 -5.489516729825760e+02 + ME 2.640462426858579e+00 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 4.522293593956092e+02 5.926970499435945e+02 8.184627962711173e+01 + 3 7.499999999999994e+02 -4.522293593956091e+02 -5.926970499435945e+02 -8.184627962711099e+01 + ME 2.006172642011354e+00 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 5.347787837401266e+02 -3.496041176300519e+02 3.927958927961735e+02 + 3 7.499999999999994e+02 -5.347787837401273e+02 3.496041176300523e+02 -3.927958927961737e+02 + ME 2.043497155295106e+00 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 3.012154446331053e+02 6.385938504619652e+02 -2.529172790986054e+02 + 3 7.500000000000000e+02 -3.012154446331053e+02 -6.385938504619652e+02 2.529172790986054e+02 + ME 1.983594671806983e+00 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 7.361476225156937e+02 -7.555700020645612e+01 1.219746595990005e+02 + 3 7.499999999999993e+02 
-7.361476225156937e+02 7.555700020645673e+01 -1.219746595989994e+02 + ME 2.000760387842997e+00 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.682577368640646e+02 -6.981157757233002e+02 5.632184566245705e+01 + 3 7.500000000000002e+02 -2.682577368640645e+02 6.981157757233002e+02 -5.632184566245736e+01 + ME 2.008708228725182e+00 + +Event 210 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -9.656834699970364e+01 -3.286793908889076e+01 7.430304522277629e+02 + 3 7.500000000000000e+02 9.656834699970366e+01 3.286793908889072e+01 -7.430304522277628e+02 + ME 2.051796274876418e+01 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.319690574526541e+02 -6.682701567195196e+02 2.492094581323018e+02 + 3 7.499999999999999e+02 2.319690574526541e+02 6.682701567195196e+02 -2.492094581323018e+02 + ME 1.983716976680244e+00 + +Event 212 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.134259033134994e+02 3.540686678370770e+02 2.466658460348640e+02 + 3 7.500000000000000e+02 6.134259033134992e+02 -3.540686678370769e+02 -2.466658460348641e+02 + ME 1.983825553445415e+00 + +Event 213 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999999e+02 -5.285016171336246e+02 5.172862825223079e+02 -1.249037333364969e+02 + 3 7.499999999999999e+02 5.285016171336246e+02 -5.172862825223079e+02 1.249037333364970e+02 + ME 2.000314035056867e+00 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.200786657490620e+02 -3.467878641431868e+02 -5.829132225428646e+02 + 3 7.499999999999999e+02 -3.200786657490620e+02 3.467878641431867e+02 5.829132225428646e+02 + ME 3.007924411782479e+00 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999953e+02 -6.774094555834020e+02 -2.472477212256584e+02 2.061188827713712e+02 + 3 7.499999999999992e+02 6.774094555834032e+02 2.472477212256532e+02 -2.061188827713711e+02 + ME 1.987666589466212e+00 + +Event 216 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -2.582427834329911e+02 2.233184861234906e+02 6.677870308416391e+02 + 3 7.499999999999992e+02 2.582427834329910e+02 -2.233184861234906e+02 -6.677870308416393e+02 + ME 5.356191869677452e+00 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.526164805502724e+02 -1.109167036758217e+02 -5.876527940714437e+02 + 3 7.499999999999997e+02 -4.526164805502727e+02 1.109167036758218e+02 5.876527940714441e+02 + ME 3.073688651172201e+00 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999842e+02 -2.774671855265902e+02 -6.964182224530531e+02 2.266319463985957e+01 + 3 7.499999999999866e+02 2.774671855265876e+02 6.964182224530413e+02 -2.266319463988859e+01 + ME 2.010698134374191e+00 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -5.962071114522208e+02 4.057112512865971e+02 -2.059986913387149e+02 + 3 7.500000000000006e+02 5.962071114522209e+02 -4.057112512865967e+02 2.059986913387148e+02 + ME 1.987682472208512e+00 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.732003838284643e+02 1.672407953016318e+02 4.538412017058155e+02 + 3 7.499999999999999e+02 -5.732003838284643e+02 -1.672407953016320e+02 -4.538412017058155e+02 + ME 2.157161276750328e+00 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.373739041416514e+01 6.118806829877553e+02 -4.330572318790715e+02 + 3 7.500000000000001e+02 -2.373739041416518e+01 -6.118806829877553e+02 4.330572318790716e+02 + ME 2.107991304156184e+00 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.926156270671631e+02 -4.951619530302006e+02 4.813426379071443e+02 + 3 7.500000000000000e+02 2.926156270671630e+02 4.951619530302006e+02 -4.813426379071443e+02 + ME 
2.245144056797008e+00 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.745329870696012e+02 3.653875131398770e+01 -7.284937584337814e+02 + 3 7.499999999999995e+02 1.745329870696012e+02 -3.653875131398771e+01 7.284937584337814e+02 + ME 1.348601976294947e+01 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -5.087597823083426e+02 -4.620767648884738e+02 -3.002474766851025e+02 + 3 7.499999999999997e+02 5.087597823083424e+02 4.620767648884733e+02 3.002474766851024e+02 + ME 1.986945509771933e+00 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000013e+02 -2.331337537494541e+02 -7.096534890457165e+02 -6.738381369388374e+01 + 3 7.499999999999987e+02 2.331337537494539e+02 7.096534890457161e+02 6.738381369388425e+01 + ME 2.007710169037881e+00 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.653645699768242e+02 -5.593089569178837e+02 4.233853277009536e+02 + 3 7.499999999999998e+02 -2.653645699768244e+02 5.593089569178835e+02 -4.233853277009537e+02 + ME 2.089198208343560e+00 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.520233915006850e+02 6.293027014655148e+02 
-3.786383477209280e+02 + 3 7.499999999999999e+02 1.520233915006851e+02 -6.293027014655148e+02 3.786383477209282e+02 + ME 2.028168432094755e+00 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -3.734089345660776e+00 1.222238163786131e+02 7.399644554210732e+02 + 3 7.499999999999997e+02 3.734089345661521e+00 -1.222238163786135e+02 -7.399644554210735e+02 + ME 1.851576164018946e+01 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -6.305938355568204e+02 2.251166185777011e+02 -3.378963193020605e+02 + 3 7.500000000000007e+02 6.305938355568201e+02 -2.251166185777020e+02 3.378963193020607e+02 + ME 1.999249677967361e+00 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000066e+02 1.722967433849862e+02 6.011245480280832e+02 -4.140810427645376e+02 + 3 7.500000000000027e+02 -1.722967433849865e+02 -6.011245480280835e+02 4.140810427645375e+02 + ME 2.073237677970927e+00 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -2.986233395749247e+02 5.694313413368087e+02 -3.860984959365696e+02 + 3 7.499999999999993e+02 2.986233395749253e+02 -5.694313413368089e+02 3.860984959365697e+02 + ME 2.035845382635010e+00 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999983e+02 1.772789865710537e+02 -7.124844710497254e+02 -1.530948706957047e+02 + 3 7.500000000000007e+02 -1.772789865710537e+02 7.124844710497252e+02 1.530948706957046e+02 + ME 1.995821457072535e+00 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.448110289802655e+02 -6.646149940143566e+02 -4.350016120507395e+01 + 3 7.500000000000002e+02 -3.448110289802656e+02 6.646149940143566e+02 4.350016120507374e+01 + ME 2.009657867600829e+00 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.161943612943173e+01 -7.278723602156027e+02 1.733126107582698e+02 + 3 7.499999999999995e+02 -5.161943612943171e+01 7.278723602156031e+02 -1.733126107582698e+02 + ME 1.992541381337467e+00 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 2.147886171723103e+02 -3.086282984760169e+02 6.489332965051219e+02 + 3 7.499999999999995e+02 -2.147886171723102e+02 3.086282984760170e+02 -6.489332965051220e+02 + ME 4.512216218351590e+00 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.457991029234601e+02 -2.210062269052574e+02 6.732303087882513e+02 + 3 7.500000000000000e+02 2.457991029234601e+02 2.210062269052574e+02 -6.732303087882513e+02 + ME 5.665591081380857e+00 + +Event 237 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.076996593814686e+02 -4.026323435046959e+02 -5.977023067186025e+02 + 3 7.499999999999999e+02 -2.076996593814686e+02 4.026323435046959e+02 5.977023067186025e+02 + ME 3.228270627806512e+00 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.969032777715727e+02 -4.416010667956443e+02 -1.058063078956158e+02 + 3 7.500000000000001e+02 -5.969032777715726e+02 4.416010667956443e+02 1.058063078956159e+02 + ME 2.003115322790910e+00 + +Event 239 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.188858424502662e+02 7.168183137005395e+02 -2.757703969522984e+01 + 3 7.500000000000000e+02 -2.188858424502661e+02 -7.168183137005394e+02 2.757703969522973e+01 + ME 2.010510537061426e+00 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 5.906386180567445e+02 -1.043852672012196e+01 -4.621007033320046e+02 + 3 7.500000000000011e+02 -5.906386180567446e+02 1.043852672012131e+01 4.621007033320056e+02 + ME 2.180538801263046e+00 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 1.310164484338845e+02 -7.028250186529380e+02 -2.266532227771519e+02 + 3 7.500000000000001e+02 -1.310164484338845e+02 
7.028250186529380e+02 2.266532227771515e+02 + ME 1.985289439468672e+00 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.060596880801855e+02 6.058870732428816e+02 3.189487761529025e+02 + 3 7.499999999999997e+02 3.060596880801855e+02 -6.058870732428816e+02 -3.189487761529026e+02 + ME 1.991689118163374e+00 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.455428611464695e+02 6.177778614269365e-02 -3.818041521596316e+02 + 3 7.500000000000000e+02 -6.455428611464695e+02 -6.177778614266934e-02 3.818041521596317e+02 + ME 2.031321522676738e+00 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.600023964153236e+02 5.665197502294575e+01 -5.896510400382238e+02 + 3 7.500000000000001e+02 -4.600023964153238e+02 -5.665197502294569e+01 5.896510400382238e+02 + ME 3.102724647631286e+00 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.283205132260829e+02 3.195629486835196e+01 -5.313720264775102e+02 + 3 7.500000000000001e+02 5.283205132260829e+02 -3.195629486835195e+01 5.313720264775100e+02 + ME 2.504416662787443e+00 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 
-2.245676679674773e+02 -7.153172514000221e+02 1.976340945196909e+01 + 3 7.500000000000003e+02 2.245676679674772e+02 7.153172514000221e+02 -1.976340945196911e+01 + ME 2.010791836793424e+00 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.304482834256935e+02 1.189549860529900e+02 7.037352484843176e+02 + 3 7.500000000000000e+02 2.304482834256934e+02 -1.189549860529901e+02 -7.037352484843176e+02 + ME 8.380012727899235e+00 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 2.537801015626409e+02 -6.571572670245547e+02 -2.573713007459760e+02 + 3 7.500000000000005e+02 -2.537801015626402e+02 6.571572670245531e+02 2.573713007459746e+02 + ME 1.983507342724994e+00 + +Event 249 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.380072251709859e+02 4.007276857381548e+02 5.875967006794293e+02 + 3 7.500000000000013e+02 -2.380072251709867e+02 -4.007276857381548e+02 -5.875967006794292e+02 + ME 3.072885070943804e+00 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 6.786028214439425e-01 -3.391021082103954e+02 6.689613589030946e+02 + 3 7.500000000000001e+02 -6.786028214447883e-01 3.391021082103958e+02 -6.689613589030951e+02 + ME 5.419965726617475e+00 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 8.268187630824573e+01 3.486690416389526e+02 6.588578046382401e+02 + 3 7.499999999999982e+02 -8.268187630824475e+01 -3.486690416389528e+02 -6.588578046382391e+02 + ME 4.918030503810574e+00 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 -4.102134709467123e+02 -2.983471238084353e+02 5.524616746608582e+02 + 3 7.499999999999987e+02 4.102134709467135e+02 2.983471238084358e+02 -5.524616746608599e+02 + ME 2.671405704941876e+00 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.361744772971164e+02 -1.872893882889482e+02 -7.133576920221781e+02 + 3 7.499999999999997e+02 1.361744772971161e+02 1.872893882889483e+02 7.133576920221780e+02 + ME 9.846978238614682e+00 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 6.137214690822855e+02 -3.619701676967441e+02 2.341443061128604e+02 + 3 7.500000000000000e+02 -6.137214690822851e+02 3.619701676967441e+02 -2.341443061128601e+02 + ME 1.984625207398940e+00 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.766263610363187e+02 -4.252208793533440e+02 5.524174690773032e+02 + 3 7.500000000000000e+02 -2.766263610363188e+02 4.252208793533439e+02 -5.524174690773031e+02 + ME 2.671007381446374e+00 + +Event 0 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.886369700926295e+02 -5.166396019380891e+02 -2.383640769242444e+02 + 3 7.499999999999995e+02 -4.886369700926294e+02 5.166396019380888e+02 2.383640769242449e+02 + ME 1.984309347522146e+00 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 3.808436077386334e+02 -5.054156008216494e+02 4.025086544295820e+02 + 3 7.500000000000000e+02 -3.808436077386334e+02 5.054156008216497e+02 -4.025086544295819e+02 + ME 2.055984493577888e+00 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 5.183034673675862e+02 4.361721238657036e+02 -3.218934514357475e+02 + 3 7.499999999999985e+02 -5.183034673675862e+02 -4.361721238657033e+02 3.218934514357483e+02 + ME 1.992666939335083e+00 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.346631711305881e+02 1.079089156707178e+02 1.054783906926816e+02 + 3 7.499999999999998e+02 7.346631711305880e+02 -1.079089156707179e+02 -1.054783906926816e+02 + ME 2.003160923858462e+00 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -3.860135335017674e+02 -5.185727762898844e+02 -3.802312817805502e+02 + 3 7.499999999999999e+02 
3.860135335017676e+02 5.185727762898844e+02 3.802312817805511e+02 + ME 2.029736071461704e+00 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.412146924854130e+01 6.250106998537266e+02 4.131553586783979e+02 + 3 7.500000000000000e+02 3.412146924854130e+01 -6.250106998537266e+02 -4.131553586783979e+02 + ME 2.071755473651405e+00 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 5.103283769498017e+02 -1.882131351255163e+02 -5.163726981996678e+02 + 3 7.499999999999985e+02 -5.103283769498009e+02 1.882131351255149e+02 5.163726981996668e+02 + ME 2.409556649220553e+00 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -5.382967505952907e+02 -1.493237627632333e+02 5.004388296013620e+02 + 3 7.500000000000005e+02 5.382967505952914e+02 1.493237627632330e+02 -5.004388296013620e+02 + ME 2.326090561404597e+00 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 -4.598745101308198e+02 -5.621086252728979e+02 -1.872146584158240e+02 + 3 7.500000000000009e+02 4.598745101308191e+02 5.621086252728995e+02 1.872146584158209e+02 + ME 1.990368021634358e+00 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 
1.775786374899710e+02 -3.096805779838333e+02 6.595936378762344e+02 + 3 7.499999999999978e+02 -1.775786374899704e+02 3.096805779838338e+02 -6.595936378762349e+02 + ME 4.951273970403449e+00 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 -5.131238447356303e+02 -5.270922542475897e+02 1.462110648204188e+02 + 3 7.499999999999962e+02 5.131238447356279e+02 5.270922542475851e+02 -1.462110648204196e+02 + ME 1.996940667767854e+00 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.355823051698720e+02 3.562860272236928e+02 -1.777509498149288e+02 + 3 7.499999999999998e+02 6.355823051698721e+02 -3.562860272236929e+02 1.777509498149290e+02 + ME 1.991835916677821e+00 + +Event 12 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.902557542118753e+02 -4.044250918271754e+02 4.966294306674758e+02 + 3 7.499999999999995e+02 3.902557542118754e+02 4.044250918271755e+02 -4.966294306674758e+02 + ME 2.308413684383986e+00 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.388642282518219e+02 -5.842429835301779e+02 4.050975396832451e+02 + 3 7.499999999999999e+02 -2.388642282518218e+02 5.842429835301775e+02 -4.050975396832447e+02 + ME 2.059610122598538e+00 + +Event 14 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 5.937999534112082e+02 4.550788359673929e+02 5.296100814192651e+01 + 3 7.499999999999998e+02 -5.937999534112082e+02 -4.550788359673929e+02 -5.296100814192653e+01 + ME 2.008979148314806e+00 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.423179755450547e+02 4.493605018245672e+02 -5.833701511362493e+02 + 3 7.500000000000000e+02 1.423179755450545e+02 -4.493605018245675e+02 5.833701511362492e+02 + ME 3.014080798350171e+00 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -2.706489889553401e+02 1.222987596040279e+02 -6.886887091979718e+02 + 3 7.500000000000003e+02 2.706489889553404e+02 -1.222987596040274e+02 6.886887091979718e+02 + ME 6.781287478663047e+00 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.991385516924362e+02 -3.979472289417108e+02 3.936987543693655e+02 + 3 7.499999999999992e+02 4.991385516924362e+02 3.979472289417105e+02 -3.936987543693655e+02 + ME 2.044586663931617e+00 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.716904304441446e+02 6.823352520667956e+02 1.519964269054312e+02 + 3 7.499999999999999e+02 -2.716904304441446e+02 -6.823352520667955e+02 -1.519964269054312e+02 + ME 1.996000455344738e+00 + +Event 19 Batch 1 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.773498418349229e+02 4.394024782830364e+02 -3.762480439535859e+02 + 3 7.499999999999997e+02 4.773498418349229e+02 -4.394024782830363e+02 3.762480439535859e+02 + ME 2.025886490708682e+00 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 2.152723736932655e+01 -4.446684930211732e+02 -6.035780888712935e+02 + 3 7.499999999999998e+02 -2.152723736932609e+01 4.446684930211729e+02 6.035780888712937e+02 + ME 3.329386212830298e+00 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.518768158716146e+02 2.713668944146131e+01 5.979723600333784e+02 + 3 7.500000000000000e+02 4.518768158716145e+02 -2.713668944146123e+01 -5.979723600333785e+02 + ME 3.232733732234588e+00 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.605342203226538e+02 -5.585006861164979e+02 4.274329229812182e+02 + 3 7.500000000000001e+02 2.605342203226538e+02 5.585006861164979e+02 -4.274329229812183e+02 + ME 2.096776535139024e+00 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.082800789219980e+02 -4.759337282317115e+02 2.786367666021681e+02 + 3 7.500000000000001e+02 
-5.082800789219980e+02 4.759337282317115e+02 -2.786367666021681e+02 + ME 1.984127871373746e+00 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.304443270717347e+02 -6.119700149046336e+02 5.206133065322422e+01 + 3 7.499999999999990e+02 4.304443270717353e+02 6.119700149046336e+02 -5.206133065322406e+01 + ME 2.009049052216004e+00 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 9.357647372272227e+01 -4.354467328122423e+02 6.034315093270851e+02 + 3 7.499999999999998e+02 -9.357647372272230e+01 4.354467328122424e+02 -6.034315093270851e+02 + ME 3.326758172083423e+00 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.406043478727808e+02 3.407510258582017e+02 -5.747846697561973e+02 + 3 7.500000000000003e+02 3.406043478727808e+02 -3.407510258582015e+02 5.747846697561973e+02 + ME 2.904517396756161e+00 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -3.284333987867702e+02 6.303438323961643e+02 -2.393703313309438e+02 + 3 7.499999999999999e+02 3.284333987867702e+02 -6.303438323961647e+02 2.393703313309439e+02 + ME 1.984240705194220e+00 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999989e+02 -1.029133508056653e+02 6.216832636308152e+02 -4.067170539174462e+02 + 3 7.499999999999976e+02 1.029133508056661e+02 -6.216832636308162e+02 4.067170539174470e+02 + ME 2.061944996606000e+00 + +Event 29 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.821390153127887e+02 5.538122935981005e+02 1.527544217783090e+02 + 3 7.499999999999990e+02 4.821390153127889e+02 -5.538122935981010e+02 -1.527544217783090e+02 + ME 1.995876946057352e+00 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 9.489768343899584e+01 -7.281060338646737e+02 1.528268076214603e+02 + 3 7.500000000000006e+02 -9.489768343899605e+01 7.281060338646736e+02 -1.528268076214603e+02 + ME 1.995865148804488e+00 + +Event 31 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 2.588924414604490e+02 -6.567632234918367e+02 -2.532523879912314e+02 + 3 7.500000000000003e+02 -2.588924414604486e+02 6.567632234918362e+02 2.532523879912308e+02 + ME 1.983585792596440e+00 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.671713815281792e+02 4.353863719603355e+02 4.879793885994619e+02 + 3 7.500000000000005e+02 3.671713815281791e+02 -4.353863719603356e+02 -4.879793885994619e+02 + ME 2.271178879084551e+00 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000023e+02 6.319452257542158e+02 1.146829159774965e+02 -3.872893755699342e+02 + 3 7.499999999999990e+02 -6.319452257542162e+02 -1.146829159774965e+02 3.872893755699342e+02 + ME 2.037151795755364e+00 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.486096093516040e+02 -5.693192358255851e+02 4.202009874536406e+02 + 3 7.500000000000000e+02 -2.486096093516041e+02 5.693192358255851e+02 -4.202009874536406e+02 + ME 2.083512716248840e+00 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.016777587248104e+02 2.605400911432719e+02 -4.761495372235753e+01 + 3 7.500000000000000e+02 7.016777587248104e+02 -2.605400911432719e+02 4.761495372235753e+01 + ME 2.009378076767459e+00 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.992190538187328e+02 -1.415759996521825e+02 -4.282438112373230e+02 + 3 7.500000000000001e+02 5.992190538187328e+02 1.415759996521826e+02 4.282438112373230e+02 + ME 2.098343464958287e+00 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.677951851527304e+02 1.710101924456866e+02 2.954743724361214e+02 + 3 7.499999999999999e+02 -6.677951851527304e+02 -1.710101924456866e+02 -2.954743724361211e+02 + ME 1.986100619014213e+00 + +Event 38 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.262233168639915e+01 -1.242788051966740e+02 7.369757269768426e+02 + 3 7.500000000000003e+02 6.262233168639881e+01 1.242788051966741e+02 -7.369757269768426e+02 + ME 1.689340277486609e+01 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.702492999956165e+02 -1.498752499034606e+01 3.362160762813851e+02 + 3 7.500000000000003e+02 -6.702492999956166e+02 1.498752499034595e+01 -3.362160762813852e+02 + ME 1.998451081732437e+00 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.153528524047671e+02 -7.173641694168138e+02 -3.888183892327462e+01 + 3 7.499999999999998e+02 2.153528524047672e+02 7.173641694168139e+02 3.888183892327462e+01 + ME 2.009943158764145e+00 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000041e+02 -1.577959591347038e+02 1.068869602279858e+02 -7.253796337187771e+02 + 3 7.500000000000107e+02 1.577959591347143e+02 -1.068869602279727e+02 7.253796337187816e+02 + ME 1.254378447326961e+01 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 2.605757710166904e+02 -3.124498327841131e+02 6.300598142654687e+02 + 3 7.500000000000003e+02 
-2.605757710166903e+02 3.124498327841133e+02 -6.300598142654685e+02 + ME 3.913466437570163e+00 + +Event 43 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 7.087179265265712e+02 -2.450238073334031e+02 1.349942442185459e+01 + 3 7.500000000000000e+02 -7.087179265265711e+02 2.450238073334030e+02 -1.349942442185452e+01 + ME 2.010950853835399e+00 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 -1.936709096449666e+01 -3.211501049305160e+02 6.774861813282196e+02 + 3 7.499999999999973e+02 1.936709096449596e+01 3.211501049305142e+02 -6.774861813282163e+02 + ME 5.934203466747352e+00 + +Event 45 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.123591650584196e+02 -4.263717314995432e+02 -4.589848206317993e+02 + 3 7.500000000000002e+02 4.123591650584197e+02 4.263717314995431e+02 4.589848206317993e+02 + ME 2.171437953450995e+00 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.417510221679373e+02 1.507214984231037e+02 -5.870596769721773e+02 + 3 7.500000000000000e+02 -4.417510221679373e+02 -1.507214984231037e+02 5.870596769721773e+02 + ME 3.065223124632955e+00 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000018e+02 3.841729019576124e+02 -2.923622481869862e+02 5.739647177627895e+02 + 3 7.500000000000020e+02 -3.841729019576121e+02 2.923622481869870e+02 -5.739647177627897e+02 + ME 2.894697499858977e+00 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.683917324787872e+02 -4.888657012953391e+02 2.100391741506460e+01 + 3 7.499999999999997e+02 -5.683917324787874e+02 4.888657012953389e+02 -2.100391741506465e+01 + ME 2.010753302031892e+00 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.514168436731682e+01 -6.826028811438466e+00 7.479390349325276e+02 + 3 7.500000000000003e+02 5.514168436731686e+01 6.826028811438419e+00 -7.479390349325279e+02 + ME 2.474120018640866e+01 + +Event 50 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.984159499903414e+02 -4.045210751108285e+02 -5.566063425640616e+02 + 3 7.500000000000003e+02 2.984159499903414e+02 4.045210751108285e+02 5.566063425640616e+02 + ME 2.709765317407777e+00 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.047061352594473e+02 -2.553121422897803e+02 -2.655132627753894e+01 + 3 7.499999999999998e+02 7.047061352594473e+02 2.553121422897803e+02 2.655132627753903e+01 + ME 2.010552676572688e+00 + +Event 52 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.049072352300011e+02 -2.374327835145617e+02 -6.812772550606136e+02 + 3 7.499999999999998e+02 -2.049072352300011e+02 2.374327835145616e+02 6.812772550606135e+02 + ME 6.196141531373791e+00 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.762846372615920e+02 -3.488577990178416e+01 6.963833581971029e+02 + 3 7.499999999999997e+02 -2.762846372615919e+02 3.488577990178417e+01 -6.963833581971028e+02 + ME 7.516466508613185e+00 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.965754309649121e+02 2.298482753022433e+02 3.921320256107800e+02 + 3 7.499999999999998e+02 5.965754309649124e+02 -2.298482753022436e+02 -3.921320256107794e+02 + ME 2.042705026930181e+00 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 5.249326727483450e+02 -5.207531728510860e+02 1.255460953068927e+02 + 3 7.499999999999985e+02 -5.249326727483449e+02 5.207531728510860e+02 -1.255460953068926e+02 + ME 2.000215452878511e+00 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -7.488450735514020e+02 -4.113909777397586e+01 6.215340066190983e+00 + 3 7.500000000000000e+02 7.488450735514019e+02 4.113909777397589e+01 -6.215340066191020e+00 + ME 2.011060688550504e+00 + +Event 57 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.609915062325111e+02 7.065463493516471e+02 1.933235400535789e+02 + 3 7.499999999999999e+02 1.609915062325111e+02 -7.065463493516471e+02 -1.933235400535789e+02 + ME 1.989455455890459e+00 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.364577375600944e+02 -7.175918329662071e+02 1.701212718039087e+02 + 3 7.499999999999997e+02 -1.364577375600944e+02 7.175918329662074e+02 -1.701212718039087e+02 + ME 1.993053556927237e+00 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.772109336552556e+02 -3.832540253228650e+02 2.871304409891935e+02 + 3 7.500000000000000e+02 5.772109336552556e+02 3.832540253228651e+02 -2.871304409891938e+02 + ME 1.984937386100476e+00 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.416667813988462e+02 -5.270418494550995e+02 4.098666849251105e+02 + 3 7.499999999999986e+02 3.416667813988453e+02 5.270418494550997e+02 -4.098666849251106e+02 + ME 2.066637019285051e+00 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -4.259888540845536e+02 -6.048881415507168e+02 -1.230602795667552e+02 + 3 7.500000000000003e+02 
4.259888540845541e+02 6.048881415507168e+02 1.230602795667548e+02 + ME 2.000595572517847e+00 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 4.607995913538437e+02 -5.197207754200592e+02 -2.829382480416942e+02 + 3 7.500000000000006e+02 -4.607995913538437e+02 5.197207754200587e+02 2.829382480416937e+02 + ME 1.984493092557474e+00 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.281800042196733e+02 -1.771681287919233e+02 -7.174129498821534e+02 + 3 7.500000000000002e+02 1.281800042196737e+02 1.771681287919234e+02 7.174129498821534e+02 + ME 1.062295525526163e+01 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 6.405153128760093e+02 -3.399997059964824e+02 -1.914166499906788e+02 + 3 7.500000000000001e+02 -6.405153128760090e+02 3.399997059964824e+02 1.914166499906790e+02 + ME 1.989736848495627e+00 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.838295106811601e+02 2.142358614545267e+02 -6.948450954490232e+02 + 3 7.499999999999998e+02 -1.838295106811600e+02 -2.142358614545267e+02 6.948450954490232e+02 + ME 7.357277043327562e+00 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
5.913261705169194e+02 -1.414127723755366e+02 4.391307184322907e+02 + 3 7.500000000000002e+02 -5.913261705169197e+02 1.414127723755367e+02 -4.391307184322907e+02 + ME 2.121048094879748e+00 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.975437950835899e+02 -2.686137949018978e+01 -4.524598095400129e+02 + 3 7.500000000000002e+02 5.975437950835899e+02 2.686137949018994e+01 4.524598095400129e+02 + ME 2.153479890398904e+00 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.181330717451524e+02 -5.368392105903471e+02 7.643155066244263e+01 + 3 7.499999999999997e+02 -5.181330717451524e+02 5.368392105903467e+02 -7.643155066244277e+01 + ME 2.006777808229542e+00 + +Event 69 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.851952108095214e+02 3.816438081633558e+02 -6.185068613878011e+02 + 3 7.500000000000001e+02 -1.851952108095216e+02 -3.816438081633559e+02 6.185068613878012e+02 + ME 3.629256087665883e+00 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.420936051939211e+02 -6.644346036493946e+02 6.323466421748260e+01 + 3 7.500000000000002e+02 -3.420936051939210e+02 6.644346036493946e+02 -6.323466421748285e+01 + ME 2.008103300392967e+00 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.014973205429060e+02 -5.862298917340072e+02 4.221769106092424e+02 + 3 7.499999999999995e+02 -2.014973205429060e+02 5.862298917340071e+02 -4.221769106092424e+02 + ME 2.087012501014583e+00 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999983e+02 -5.128995270704372e+02 -5.335011449610839e+02 -1.216988227393155e+02 + 3 7.500000000000007e+02 5.128995270704373e+02 5.335011449610839e+02 1.216988227393155e+02 + ME 2.000802145188290e+00 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 6.269260124380270e+02 7.328600119139391e+01 -4.050838641046208e+02 + 3 7.499999999999995e+02 -6.269260124380268e+02 -7.328600119139392e+01 4.050838641046204e+02 + ME 2.059590627293980e+00 + +Event 74 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.868628802298931e+02 4.654417523012835e+02 3.815671661692834e+01 + 3 7.500000000000000e+02 -5.868628802298932e+02 -4.654417523012835e+02 -3.815671661692834e+01 + ME 2.009985162172173e+00 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.396554044798227e+02 3.890541063878783e+02 -4.447320342052838e+01 + 3 7.499999999999999e+02 -6.396554044798226e+02 -3.890541063878784e+02 4.447320342052834e+01 + ME 2.009593868310018e+00 + +Event 76 Batch 1 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 2.541578029035597e+02 6.696747494064764e+02 2.223500421196449e+02 + 3 7.499999999999998e+02 -2.541578029035597e+02 -6.696747494064765e+02 -2.223500421196449e+02 + ME 1.985725329848053e+00 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.794085909550948e+02 -4.035830095427122e+02 -6.061627767357163e+02 + 3 7.500000000000000e+02 1.794085909550948e+02 4.035830095427124e+02 6.061627767357163e+02 + ME 3.376661105012696e+00 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.251897931439634e+01 5.985924556239656e+02 4.513091707917184e+02 + 3 7.499999999999999e+02 2.251897931439627e+01 -5.985924556239656e+02 -4.513091707917183e+02 + ME 2.150461455825715e+00 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -8.069660169556643e+01 4.506184873965207e+02 -5.940799923337418e+02 + 3 7.499999999999999e+02 8.069660169556641e+01 -4.506184873965206e+02 5.940799923337419e+02 + ME 3.170027479232333e+00 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.302693508383343e+02 -1.702453183034974e+02 1.494010782754770e+01 + 3 7.500000000000000e+02 
-7.302693508383343e+02 1.702453183034974e+02 -1.494010782754776e+01 + ME 2.010919551852894e+00 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -3.571124181827449e+02 -1.296917603996396e+02 -6.466457825302152e+02 + 3 7.500000000000003e+02 3.571124181827448e+02 1.296917603996393e+02 6.466457825302152e+02 + ME 4.428752434825682e+00 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -2.079791544122488e+02 6.603817609425097e+02 2.883411194131628e+02 + 3 7.499999999999999e+02 2.079791544122487e+02 -6.603817609425097e+02 -2.883411194131628e+02 + ME 1.985082613506332e+00 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 2.106645025753154e+02 2.591228168950026e+02 6.715473424257536e+02 + 3 7.500000000000000e+02 -2.106645025753154e+02 -2.591228168950025e+02 -6.715473424257551e+02 + ME 5.566077345203682e+00 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.829033392465455e+02 5.412230867109214e+02 4.353656750988264e+02 + 3 7.499999999999999e+02 -2.829033392465454e+02 -5.412230867109214e+02 -4.353656750988264e+02 + ME 2.112834869994339e+00 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000001e+02 -3.452959441952361e+02 3.191655713577259e+02 -5.842979111568145e+02 + 3 7.500000000000001e+02 3.452959441952361e+02 -3.191655713577259e+02 5.842979111568145e+02 + ME 3.026699300876776e+00 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.315896816222929e+02 -5.262181855033000e+02 3.151583220585842e+02 + 3 7.500000000000000e+02 4.315896816222929e+02 5.262181855033001e+02 -3.151583220585842e+02 + ME 1.990528310218343e+00 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.811116980879163e+02 -3.427907013503557e+02 -5.474928389013653e+02 + 3 7.500000000000002e+02 -3.811116980879164e+02 3.427907013503558e+02 5.474928389013651e+02 + ME 2.628000046826205e+00 + +Event 88 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.146396045022380e+01 -6.744579332872387e+02 -3.222245766413236e+02 + 3 7.499999999999998e+02 6.146396045022396e+01 6.744579332872382e+02 3.222245766413236e+02 + ME 1.992781166334666e+00 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -9.261861454486514e+01 7.306822737190313e+02 -1.415104487752590e+02 + 3 7.499999999999997e+02 9.261861454486517e+01 -7.306822737190316e+02 1.415104487752590e+02 + ME 1.997699525087340e+00 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -4.139376319099176e+02 3.915039386019012e+02 4.877297437598081e+02 + 3 7.499999999999999e+02 4.139376319099175e+02 -3.915039386019013e+02 -4.877297437598081e+02 + ME 2.270161148116216e+00 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 7.126395371709493e+00 -7.420663234346179e+02 -1.085669660312691e+02 + 3 7.499999999999997e+02 -7.126395371709608e+00 7.420663234346180e+02 1.085669660312693e+02 + ME 2.002727750198704e+00 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.726503622084788e+02 -6.521623274545965e+02 -3.276830039508977e+02 + 3 7.499999999999992e+02 1.726503622084796e+02 6.521623274545966e+02 3.276830039508984e+02 + ME 1.994792831040187e+00 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.209821045541699e+02 -6.195820136229019e+02 -3.731214333339776e+01 + 3 7.500000000000000e+02 -4.209821045541700e+02 6.195820136229019e+02 3.731214333339766e+01 + ME 2.010033124088934e+00 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 6.269909815974254e+02 3.639809310785994e+02 1.920942237722172e+02 + 3 7.499999999999984e+02 -6.269909815974265e+02 -3.639809310786015e+02 -1.920942237722174e+02 + ME 1.989636481366654e+00 + +Event 95 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.553133048499316e+02 5.741086360057838e+02 -1.599658353843466e+02 + 3 7.500000000000000e+02 -4.553133048499316e+02 -5.741086360057838e+02 1.599658353843466e+02 + ME 1.994700919179108e+00 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.565619044698291e+02 6.422192246258138e+02 -2.902248381786659e+02 + 3 7.499999999999999e+02 -2.565619044698291e+02 -6.422192246258138e+02 2.902248381786659e+02 + ME 1.985324124083277e+00 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.039638951579397e+02 -2.118244738005755e+02 -5.953516303059346e+02 + 3 7.500000000000000e+02 -4.039638951579398e+02 2.118244738005755e+02 5.953516303059346e+02 + ME 3.190135488781065e+00 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -2.854056619662841e+02 -6.558450095995934e+02 2.256345086658882e+02 + 3 7.500000000000000e+02 2.854056619662841e+02 6.558450095995934e+02 -2.256345086658882e+02 + ME 1.985389237734055e+00 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 6.845741170030386e+02 -2.951022215571765e+02 8.229797787022088e+01 + 3 7.499999999999983e+02 
-6.845741170030385e+02 2.951022215571757e+02 -8.229797787021991e+01 + ME 2.006120622294754e+00 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.131199432162236e+02 6.658078482508749e+02 3.261959344539691e+02 + 3 7.499999999999998e+02 -1.131199432162237e+02 -6.658078482508747e+02 -3.261959344539691e+02 + ME 1.994220336919275e+00 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999974e+02 6.281139753924398e+02 1.799537449267306e+01 -4.094496311068950e+02 + 3 7.499999999999968e+02 -6.281139753924423e+02 -1.799537449266958e+01 4.094496311068941e+02 + ME 2.066004084437084e+00 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.331822864934145e+02 3.520635566235372e+02 -5.009035423770954e+02 + 3 7.499999999999998e+02 -4.331822864934146e+02 -3.520635566235372e+02 5.009035423770955e+02 + ME 2.328303353515298e+00 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.092502234787573e+02 -1.620958235289011e+02 -4.062500574591608e+02 + 3 7.499999999999999e+02 -6.092502234787573e+02 1.620958235289010e+02 4.062500574591608e+02 + ME 2.061266375378142e+00 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000003e+02 -7.390318022514667e+02 7.866020384803387e+01 -1.007202441991433e+02 + 3 7.500000000000001e+02 7.390318022514666e+02 -7.866020384803385e+01 1.007202441991434e+02 + ME 2.003811783293278e+00 + +Event 105 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.116661021739875e+02 -1.805958333876086e+02 -7.193162215523179e+02 + 3 7.500000000000001e+02 1.116661021739881e+02 1.805958333876084e+02 7.193162215523182e+02 + ME 1.102877972363151e+01 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.896019143353706e+02 -3.499565434426603e+01 -2.927874722764089e+02 + 3 7.500000000000000e+02 6.896019143353705e+02 3.499565434426599e+01 2.927874722764089e+02 + ME 1.985683803751423e+00 + +Event 107 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.876020488404372e+02 4.472601133788148e+02 1.310809794702898e+02 + 3 7.500000000000000e+02 5.876020488404371e+02 -4.472601133788147e+02 -1.310809794702898e+02 + ME 1.999356513234769e+00 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.474590340134895e+02 7.351525828033791e+02 1.750752082190591e+01 + 3 7.499999999999999e+02 1.474590340134896e+02 -7.351525828033792e+02 -1.750752082190613e+01 + ME 2.010855956167804e+00 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.957445751534648e+02 5.717428497601883e+02 -3.848964276469646e+02 + 3 7.499999999999997e+02 -2.957445751534648e+02 -5.717428497601884e+02 3.848964276469649e+02 + ME 2.034549794585122e+00 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.063365970415883e+00 -3.265074122789853e+02 6.751711970456008e+02 + 3 7.500000000000002e+02 -6.063365970415944e+00 3.265074122789853e+02 -6.751711970456010e+02 + ME 5.784963616984554e+00 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.490455319369717e+02 -3.406507833202949e+01 -1.644267385470533e+01 + 3 7.500000000000010e+02 7.490455319369723e+02 3.406507833202959e+01 1.644267385470358e+01 + ME 2.010883546212678e+00 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.374207045694743e+02 5.497834357859954e+01 7.093016218641318e+02 + 3 7.500000000000000e+02 -2.374207045694743e+02 -5.497834357859954e+01 -7.093016218641318e+02 + ME 9.172605741240773e+00 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.836102630475973e+01 7.112469905859641e+02 2.306983072999552e+02 + 3 7.500000000000000e+02 -5.836102630475965e+01 -7.112469905859641e+02 -2.306983072999552e+02 + ME 
1.984915059360817e+00 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.832697985127738e+02 -6.792924150459122e+02 1.442914970177566e+02 + 3 7.500000000000001e+02 2.832697985127741e+02 6.792924150459122e+02 -1.442914970177563e+02 + ME 1.997251242389987e+00 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.877068793675994e+02 -5.687932010880680e+02 2.977543787873256e+02 + 3 7.499999999999999e+02 3.877068793675994e+02 5.687932010880680e+02 -2.977543787873257e+02 + ME 1.986487190210642e+00 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.888638462220085e+02 -2.429234055910374e+02 -6.481094794568894e+02 + 3 7.499999999999998e+02 2.888638462220085e+02 2.429234055910374e+02 6.481094794568892e+02 + ME 4.481760920260451e+00 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.057325810417608e+02 -2.863650344598338e+02 -6.850661926741573e+02 + 3 7.500000000000002e+02 1.057325810417607e+02 2.863650344598338e+02 6.850661926741573e+02 + ME 6.482158221852050e+00 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.434912620455068e+02 4.133496415655201e+02 
-5.231212351989174e+02 + 3 7.500000000000001e+02 3.434912620455068e+02 -4.133496415655201e+02 5.231212351989174e+02 + ME 2.450087544872921e+00 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 5.337372537382147e+02 -1.304062014214320e+00 5.268992725315253e+02 + 3 7.499999999999994e+02 -5.337372537382158e+02 1.304062014214380e+00 -5.268992725315243e+02 + ME 2.474282470920531e+00 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 2.526044577837270e+02 -5.022225052464477e+02 4.964509473571084e+02 + 3 7.499999999999982e+02 -2.526044577837286e+02 5.022225052464470e+02 -4.964509473571077e+02 + ME 2.307605300736761e+00 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -9.983643901956184e+01 7.430481637446572e+02 2.030053692852349e+01 + 3 7.500000000000001e+02 9.983643901956184e+01 -7.430481637446572e+02 -2.030053692852350e+01 + ME 2.010775436568609e+00 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.972556630245654e+02 1.216831948271524e+02 -2.480478592466205e+02 + 3 7.499999999999999e+02 -6.972556630245654e+02 -1.216831948271524e+02 2.480478592466204e+02 + ME 1.983764132347880e+00 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 -6.140782040297679e+02 4.305900507141508e+02 4.093428375274711e-01 + 3 7.499999999999983e+02 6.140782040297669e+02 -4.305900507141531e+02 -4.093428375271901e-01 + ME 2.011090139533466e+00 + +Event 124 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999911e+02 3.588901340872197e+01 -1.799600883903310e+02 -7.272044728293959e+02 + 3 7.499999999999977e+02 -3.588901340872244e+01 1.799600883903320e+02 7.272044728294055e+02 + ME 1.308008567657206e+01 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 7.215024155075264e+02 1.793419884871221e+02 9.884693006001493e+01 + 3 7.499999999999999e+02 -7.215024155075264e+02 -1.793419884871222e+02 -9.884693006001490e+01 + ME 2.004062293875886e+00 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.281551412895951e+02 -2.336695998906990e+02 -7.010526213116709e+02 + 3 7.500000000000001e+02 -1.281551412895947e+02 2.336695998906987e+02 7.010526213116710e+02 + ME 8.043521587411112e+00 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.957501429351264e+02 4.658358804290751e+02 -5.542371460356527e+02 + 3 7.500000000000011e+02 1.957501429351260e+02 -4.658358804290745e+02 5.542371460356528e+02 + ME 2.687590346933034e+00 + +Event 128 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.318760792775902e+01 -6.964451407927102e+02 2.780112459296673e+02 + 3 7.500000000000003e+02 -1.318760792775899e+01 6.964451407927103e+02 -2.780112459296673e+02 + ME 1.984082110439539e+00 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.550350801573031e+02 -6.509004311564038e+02 3.388107918013039e+02 + 3 7.500000000000002e+02 -1.550350801573031e+02 6.509004311564037e+02 -3.388107918013039e+02 + ME 1.999695588062834e+00 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.645749092425722e+02 -7.293083567438612e+02 5.936682601728498e+01 + 3 7.499999999999986e+02 1.645749092425725e+02 7.293083567438607e+02 -5.936682601728561e+01 + ME 2.008449590658919e+00 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.289732569218596e+02 -7.242036888606353e+02 -1.462700107322627e+02 + 3 7.500000000000002e+02 -1.289732569218595e+02 7.242036888606352e+02 1.462700107322627e+02 + ME 1.996931117530840e+00 + +Event 132 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 6.604855820021804e+01 3.651487935846834e+02 -6.517698570073040e+02 + 3 7.499999999999978e+02 -6.604855820021821e+01 
-3.651487935846866e+02 6.517698570073051e+02 + ME 4.620643930585398e+00 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.812417328859055e+02 -6.435356503900175e+02 5.492365413234045e+01 + 3 7.500000000000000e+02 3.812417328859055e+02 6.435356503900175e+02 -5.492365413234045e+01 + ME 2.008822805330303e+00 + +Event 134 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.567102166802744e+02 -3.252202169638837e+02 1.595728729631448e+02 + 3 7.499999999999997e+02 6.567102166802742e+02 3.252202169638837e+02 -1.595728729631451e+02 + ME 1.994764975698830e+00 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.837316099570886e+02 -6.854901002516958e+02 1.099986180280411e+02 + 3 7.500000000000001e+02 2.837316099570886e+02 6.854901002516958e+02 -1.099986180280411e+02 + ME 2.002524237018472e+00 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.445964163493952e+02 -3.365935275091168e+02 6.544437895021098e+02 + 3 7.499999999999998e+02 1.445964163493952e+02 3.365935275091168e+02 -6.544437895021098e+02 + ME 4.728186824157167e+00 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 
2.788569431822099e+02 -2.310081397848981e+02 6.567907159759458e+02 + 3 7.499999999999999e+02 -2.788569431822102e+02 2.310081397848975e+02 -6.567907159759453e+02 + ME 4.827133279595881e+00 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.519171223259784e+02 5.310903622506274e+02 -3.957238508585245e+02 + 3 7.500000000000000e+02 -3.519171223259784e+02 -5.310903622506274e+02 3.957238508585245e+02 + ME 2.047082251362772e+00 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.739397549265399e+02 -2.618238911224230e+02 -5.950775661399049e+02 + 3 7.500000000000000e+02 -3.739397549265399e+02 2.618238911224231e+02 5.950775661399049e+02 + ME 3.185771296827542e+00 + +Event 140 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.980454399514575e+02 1.394595291726036e+02 -2.361855276809563e+02 + 3 7.500000000000002e+02 -6.980454399514578e+02 -1.394595291726037e+02 2.361855276809564e+02 + ME 1.984466872167745e+00 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.653840213895683e+02 -2.880922628106119e+01 5.874416916736162e+02 + 3 7.500000000000005e+02 -4.653840213895683e+02 2.880922628106116e+01 -5.874416916736163e+02 + ME 3.070667674723962e+00 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.270562981552866e+02 -5.222032422100018e+02 4.881077865527037e+02 + 3 7.500000000000000e+02 -2.270562981552866e+02 5.222032422100018e+02 -4.881077865527037e+02 + ME 2.271703521969705e+00 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 4.278386633790234e+02 5.011307304374201e+02 3.582206989124277e+02 + 3 7.500000000000007e+02 -4.278386633790235e+02 -5.011307304374207e+02 -3.582206989124284e+02 + ME 2.011195243007343e+00 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.485631413852261e+02 -2.868202125643110e+02 -2.441311477486854e+02 + 3 7.499999999999998e+02 6.485631413852262e+02 2.868202125643110e+02 2.441311477486855e+02 + ME 1.983952824756691e+00 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.489272678263819e+02 -2.043717898946646e-02 5.110566056896098e+02 + 3 7.500000000000000e+02 -5.489272678263819e+02 2.043717898953842e-02 -5.110566056896097e+02 + ME 2.379895727846479e+00 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.879378412153704e+02 -6.925213285531936e+02 2.451345579835954e+00 + 3 7.500000000000000e+02 2.879378412153704e+02 6.925213285531936e+02 -2.451345579835899e+00 + ME 2.011085665785286e+00 + +Event 147 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.952915705099506e+02 1.689784617132080e+02 -2.247574502766133e+02 + 3 7.500000000000000e+02 -6.952915705099505e+02 -1.689784617132080e+02 2.247574502766133e+02 + ME 1.985476870550384e+00 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.957082656749750e+02 1.510461070560446e+02 2.359556793440931e+02 + 3 7.499999999999999e+02 6.957082656749749e+02 -1.510461070560447e+02 -2.359556793440931e+02 + ME 1.984484189288116e+00 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.906011724554260e+02 -4.023428219257597e+02 4.980471641534193e+02 + 3 7.500000000000000e+02 3.906011724554261e+02 4.023428219257598e+02 -4.980471641534193e+02 + ME 2.314897321112815e+00 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000017e+02 3.825548542963734e+02 3.607661939729296e+02 -5.347892451616488e+02 + 3 7.500000000000000e+02 -3.825548542963752e+02 -3.607661939729286e+02 5.347892451616495e+02 + ME 2.528585842387213e+00 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -8.096198305186688e+01 -6.966591351540006e+02 -2.657276927736156e+02 + 3 7.500000000000000e+02 
8.096198305186688e+01 6.966591351540006e+02 2.657276927736156e+02 + ME 1.983534026450009e+00 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -3.053425729950391e+02 6.849509469742742e+02 1.039775724928324e+01 + 3 7.499999999999995e+02 3.053425729950393e+02 -6.849509469742737e+02 -1.039775724928318e+01 + ME 2.011007520774094e+00 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -9.852328869759022e+01 -1.550106591050277e+02 7.271621945261154e+02 + 3 7.500000000000001e+02 9.852328869759016e+01 1.550106591050275e+02 -7.271621945261154e+02 + ME 1.306716793396641e+01 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.148528978665797e+01 -7.103339962655652e+02 2.386090741703716e+02 + 3 7.499999999999999e+02 3.148528978665796e+01 7.103339962655652e+02 -2.386090741703715e+02 + ME 1.984292391971781e+00 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.874782580333551e+02 2.540696668148104e+02 -5.102093220918027e+02 + 3 7.500000000000001e+02 4.874782580333551e+02 -2.540696668148104e+02 5.102093220918026e+02 + ME 2.375343013454757e+00 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999997e+02 5.351140291898049e+02 3.025756173962970e+02 4.296521517710606e+02 + 3 7.500000000000002e+02 -5.351140291898049e+02 -3.025756173962971e+02 -4.296521517710606e+02 + ME 2.101104406431960e+00 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.236396872289433e+01 2.448362281552735e+02 -7.088034661738875e+02 + 3 7.499999999999993e+02 -1.236396872289407e+01 -2.448362281552726e+02 7.088034661738876e+02 + ME 9.095855285444577e+00 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 4.792285853193343e+02 5.225230422173582e+02 2.445600812985376e+02 + 3 7.500000000000003e+02 -4.792285853193339e+02 -5.225230422173579e+02 -2.445600812985381e+02 + ME 1.983929981579759e+00 + +Event 159 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.994239737611111e+02 4.487043427103748e+02 4.307333877573547e+01 + 3 7.499999999999999e+02 5.994239737611113e+02 -4.487043427103749e+02 -4.307333877573552e+01 + ME 2.009685515458492e+00 + +Event 160 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.056348972134737e+00 -5.530529151025127e+02 -5.065879560584343e+02 + 3 7.500000000000001e+02 1.056348972134632e+00 5.530529151025127e+02 5.065879560584343e+02 + ME 2.356403770495644e+00 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.175060175378059e+02 -6.201939914735475e+02 5.956625101373001e+01 + 3 7.499999999999997e+02 4.175060175378057e+02 6.201939914735476e+02 -5.956625101373005e+01 + ME 2.008432219179860e+00 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000131e+02 -3.532767836309999e+02 -6.048160389091212e+02 2.681288369899751e+02 + 3 7.499999999999952e+02 3.532767836310093e+02 6.048160389091256e+02 -2.681288369899763e+02 + ME 1.983591149479119e+00 + +Event 163 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.455155148577347e+01 -5.419588320322101e+02 5.172879417425158e+02 + 3 7.499999999999998e+02 -3.455155148577347e+01 5.419588320322102e+02 -5.172879417425160e+02 + ME 2.414859767910284e+00 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 1.203138720765576e+02 6.593334144494559e+02 3.366066261623008e+02 + 3 7.500000000000002e+02 -1.203138720765575e+02 -6.593334144494565e+02 -3.366066261623006e+02 + ME 1.998634330981031e+00 + +Event 165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.816823713406797e+02 -1.602945128408126e+02 7.097867180337724e+02 + 3 7.500000000000001e+02 1.816823713406797e+02 1.602945128408125e+02 -7.097867180337724e+02 + ME 
9.248548838925846e+00 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.857732699129311e+01 -2.617520387305624e+02 7.011605476085883e+02 + 3 7.500000000000001e+02 4.857732699129315e+01 2.617520387305623e+02 -7.011605476085884e+02 + ME 8.056552904397961e+00 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.900092704286321e+01 -6.320471173340015e+02 4.007684501288983e+02 + 3 7.500000000000000e+02 4.900092704286319e+01 6.320471173340014e+02 -4.007684501288983e+02 + ME 2.053619707066993e+00 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.264562441053386e+02 -1.340492719285199e+01 -5.340076132130089e+02 + 3 7.500000000000000e+02 -5.264562441053386e+02 1.340492719285199e+01 5.340076132130089e+02 + ME 2.522967143225648e+00 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.804167788417345e+02 6.461491047275698e+02 1.656529008013751e+01 + 3 7.500000000000001e+02 3.804167788417345e+02 -6.461491047275698e+02 -1.656529008013748e+01 + ME 2.010880456803565e+00 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -8.097738208309649e+01 -3.146836244882357e+02 
6.759562708266826e+02 + 3 7.500000000000000e+02 8.097738208309650e+01 3.146836244882357e+02 -6.759562708266826e+02 + ME 5.834715064701833e+00 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -7.540116185339261e+01 -3.719303863809235e+02 -6.469021969955712e+02 + 3 7.499999999999987e+02 7.540116185339261e+01 3.719303863809236e+02 6.469021969955708e+02 + ME 4.437938361437913e+00 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 -4.828016172825367e+02 -5.140078459318597e+02 -2.553400334257376e+02 + 3 7.499999999999998e+02 4.828016172825360e+02 5.140078459318594e+02 2.553400334257384e+02 + ME 1.983538851321157e+00 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.987927902937172e+01 -2.766666825658829e+01 7.462247930951604e+02 + 3 7.499999999999999e+02 -6.987927902937163e+01 2.766666825658830e+01 -7.462247930951604e+02 + ME 2.309142129384764e+01 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.529896621351459e+02 -2.479499892141398e+02 5.438944464208055e+02 + 3 7.500000000000001e+02 -4.529896621351459e+02 2.479499892141399e+02 -5.438944464208055e+02 + ME 2.598220410305965e+00 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 1.641260263514688e+02 6.805045353716746e+02 -2.692140873220751e+02 + 3 7.499999999999992e+02 -1.641260263514683e+02 -6.805045353716744e+02 2.692140873220759e+02 + ME 1.983624622951714e+00 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -4.449083452207172e+02 3.512022859276666e+02 4.911349292324311e+02 + 3 7.500000000000003e+02 4.449083452207170e+02 -3.512022859276664e+02 -4.911349292324313e+02 + ME 2.284312202099852e+00 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.756184157382538e+02 2.756014812918595e+02 1.734461871159609e+02 + 3 7.499999999999999e+02 -6.756184157382538e+02 -2.756014812918595e+02 -1.734461871159609e+02 + ME 1.992520025439582e+00 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.899028545581659e+01 6.651246644987089e+02 3.374458704430804e+02 + 3 7.499999999999997e+02 7.899028545581672e+01 -6.651246644987090e+02 -3.374458704430803e+02 + ME 1.999032967119208e+00 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000032e+02 -5.649043925133968e+02 -4.752925559656212e+02 1.322120023399303e+02 + 3 7.500000000000028e+02 5.649043925133992e+02 4.752925559656279e+02 -1.322120023399242e+02 + ME 1.999179049147394e+00 + +Event 180 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.443749249963931e+02 -3.789807589401902e+02 6.045279471629055e+01 + 3 7.499999999999998e+02 -6.443749249963931e+02 3.789807589401904e+02 -6.045279471629056e+01 + ME 2.008354355265392e+00 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.487240721466687e+02 -4.365618810460560e+01 2.530025352630979e+00 + 3 7.499999999999992e+02 7.487240721466690e+02 4.365618810460511e+01 -2.530025352631356e+00 + ME 2.011085365634105e+00 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.736127789510566e+02 -4.849363338101804e+02 5.451562667122458e+02 + 3 7.500000000000002e+02 1.736127789510568e+02 4.849363338101802e+02 -5.451562667122456e+02 + ME 2.608510167984170e+00 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -1.465394368041592e+02 7.111974597422168e+02 1.876815566786921e+02 + 3 7.499999999999994e+02 1.465394368041591e+02 -7.111974597422169e+02 -1.876815566786921e+02 + ME 1.990297183267100e+00 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -1.708920160408793e+02 -6.330544605807507e+02 3.640576448755041e+02 + 3 7.500000000000007e+02 1.708920160408793e+02 
6.330544605807504e+02 -3.640576448755053e+02 + ME 2.015490029126271e+00 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.460547321795563e+02 -2.478493945403630e+02 5.496415682298780e+02 + 3 7.499999999999998e+02 4.460547321795562e+02 2.478493945403631e+02 -5.496415682298780e+02 + ME 2.646436363519030e+00 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.118512395965141e+01 5.909417855163442e+02 4.613447698344142e+02 + 3 7.499999999999989e+02 2.118512395965102e+01 -5.909417855163435e+02 -4.613447698344139e+02 + ME 2.178298627452733e+00 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.653808167750950e+02 -7.001457156827445e+02 -4.323191990741312e+01 + 3 7.500000000000000e+02 -2.653808167750950e+02 7.001457156827445e+02 4.323191990741302e+01 + ME 2.009675273524294e+00 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 6.408357091392249e+02 3.883393337910243e+02 -3.197117018143499e+01 + 3 7.500000000000001e+02 -6.408357091392249e+02 -3.883393337910244e+02 3.197117018143511e+01 + ME 2.010312327528764e+00 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000103e+02 
6.049264826447564e+01 2.924911886358318e+01 7.469840216173631e+02 + 3 7.500000000000013e+02 -6.049264826448120e+01 -2.924911886358490e+01 -7.469840216173664e+02 + ME 2.379552267034401e+01 + +Event 190 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -3.303745399677809e+02 -6.726435165214554e+02 -3.005599811564162e+01 + 3 7.499999999999998e+02 3.303745399677807e+02 6.726435165214554e+02 3.005599811564161e+01 + ME 2.010402233084874e+00 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000059e+02 2.507415243158216e+02 -6.861277073704366e+02 1.698748279244940e+02 + 3 7.500000000000019e+02 -2.507415243158149e+02 6.861277073704406e+02 -1.698748279244789e+02 + ME 1.993093251791711e+00 + +Event 192 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.280906177080156e+02 -4.243783337877665e+02 -5.241751390360782e+02 + 3 7.500000000000001e+02 3.280906177080155e+02 4.243783337877666e+02 5.241751390360782e+02 + ME 2.456724540584341e+00 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.030479677918753e+02 3.086657427947369e+02 6.126886553888426e+02 + 3 7.500000000000000e+02 3.030479677918753e+02 -3.086657427947368e+02 -6.126886553888426e+02 + ME 3.504352033847108e+00 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.646170069729470e+02 3.428856457724426e+02 -6.122967201805791e+02 + 3 7.500000000000000e+02 2.646170069729470e+02 -3.428856457724426e+02 6.122967201805791e+02 + ME 3.496327512126989e+00 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.753494363932361e+02 4.203961533313043e+02 -2.339660238287344e+02 + 3 7.499999999999999e+02 5.753494363932361e+02 -4.203961533313043e+02 2.339660238287342e+02 + ME 1.984639517527489e+00 + +Event 196 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.281251117808167e+02 -5.086770881665589e+02 3.470684494981563e+02 + 3 7.499999999999999e+02 4.281251117808166e+02 5.086770881665594e+02 -3.470684494981560e+02 + ME 2.004096818030677e+00 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 -8.339544953992255e+01 -6.784858681092462e+02 -3.085484172250403e+02 + 3 7.500000000000001e+02 8.339544953992252e+01 6.784858681092462e+02 3.085484172250404e+02 + ME 1.988754370649548e+00 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 -8.306003478718596e+01 -8.401139414697868e+00 7.453391519822948e+02 + 3 7.499999999999989e+02 8.306003478718694e+01 8.401139414698259e+00 -7.453391519822936e+02 + ME 2.231860287132694e+01 + +Event 199 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.019646293152730e+02 3.128355820782189e+02 -6.510331689640138e+02 + 3 7.499999999999998e+02 2.019646293152729e+02 -3.128355820782187e+02 6.510331689640138e+02 + ME 4.591940058674536e+00 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 6.426710110696489e+01 3.020308647655600e+02 -6.834815991994778e+02 + 3 7.500000000000000e+02 -6.426710110696466e+01 -3.020308647655600e+02 6.834815991994776e+02 + ME 6.359396539868402e+00 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.022805093987975e+02 -5.266311217523920e+02 -3.511837885775218e+02 + 3 7.500000000000000e+02 -4.022805093987974e+02 5.266311217523918e+02 3.511837885775218e+02 + ME 2.006554767315016e+00 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.007651650396935e+02 -3.709039186002585e+02 5.783343312182138e+02 + 3 7.500000000000001e+02 3.007651650396937e+02 3.709039186002584e+02 -5.783343312182138e+02 + ME 2.948288961372177e+00 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.363438797848468e+01 7.118317541034084e+01 -7.458563423902847e+02 + 3 7.500000000000001e+02 
3.363438797848469e+01 -7.118317541034084e+01 7.458563423902847e+02 + ME 2.276381800960898e+01 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -7.528559343153034e+01 -6.321297640200866e+02 -3.965400873323798e+02 + 3 7.499999999999998e+02 7.528559343153087e+01 6.321297640200864e+02 3.965400873323802e+02 + ME 2.048108674862614e+00 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -2.566959580289875e+02 5.032495686744614e+02 4.933021961845994e+02 + 3 7.500000000000003e+02 2.566959580289877e+02 -5.032495686744614e+02 -4.933021961845996e+02 + ME 2.293627718611544e+00 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 -5.149079042091845e+02 5.440418782066432e+02 3.725969592322746e+01 + 3 7.500000000000009e+02 5.149079042091842e+02 -5.440418782066431e+02 -3.725969592322733e+01 + ME 2.010036068339084e+00 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 1.433423326606743e+02 7.409449380754376e+01 7.324363328336467e+02 + 3 7.499999999999998e+02 -1.433423326606745e+02 -7.409449380754376e+01 -7.324363328336465e+02 + ME 1.489028843267540e+01 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000001e+02 3.842573371112484e+01 7.342148980903337e+02 1.481618925735034e+02 + 3 7.499999999999995e+02 -3.842573371112481e+01 -7.342148980903333e+02 -1.481618925735034e+02 + ME 1.996624231741946e+00 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 6.901829459970380e+02 -7.313618628183488e+01 2.842509442570104e+02 + 3 7.500000000000003e+02 -6.901829459970360e+02 7.313618628183590e+01 -2.842509442570093e+02 + ME 1.984622606725810e+00 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 5.714997395800420e+02 4.794362694683772e+02 7.764606350735806e+01 + 3 7.499999999999993e+02 -5.714997395800417e+02 -4.794362694683778e+02 -7.764606350735664e+01 + ME 2.006645066069941e+00 + +Event 211 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.281705743857859e+02 6.553011438753589e+01 -7.360557777573476e+02 + 3 7.500000000000000e+02 1.281705743857858e+02 -6.553011438753575e+01 7.360557777573475e+02 + ME 1.644725159966691e+01 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -3.302472484430331e+02 -5.946768960171297e+02 3.159052773209696e+02 + 3 7.499999999999965e+02 3.302472484430314e+02 5.946768960171261e+02 -3.159052773209676e+02 + ME 1.990748546221434e+00 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.589381160217067e+02 4.877683676965268e+02 -1.103186381930632e+02 + 3 7.500000000000047e+02 -5.589381160217123e+02 -4.877683676965237e+02 1.103186381930646e+02 + ME 2.002478515476537e+00 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000036e+02 -5.489994742235585e+02 5.022963490456322e+02 9.379741487743676e+01 + 3 7.499999999999898e+02 5.489994742235668e+02 -5.022963490456343e+02 -9.379741487743821e+01 + ME 2.004720570188235e+00 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.451972808110462e+02 -4.842724140540300e+02 1.753287026773505e+02 + 3 7.499999999999995e+02 -5.451972808110462e+02 4.842724140540300e+02 -1.753287026773504e+02 + ME 1.992219833660235e+00 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.351512110674362e+02 -6.344873209566202e+02 -2.181731084834278e+02 + 3 7.500000000000000e+02 3.351512110674362e+02 6.344873209566204e+02 2.181731084834278e+02 + ME 1.986182644490094e+00 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000055e+02 2.256146404784491e+02 -3.672666490292709e+02 6.137696982684937e+02 + 3 7.500000000000331e+02 -2.256146404784360e+02 3.672666490292730e+02 -6.137696982684606e+02 + ME 
3.526732459702814e+00 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.145068310337325e+02 3.208524589836873e+02 -4.414027870140238e+02 + 3 7.500000000000002e+02 -5.145068310337326e+02 -3.208524589836873e+02 4.414027870140238e+02 + ME 2.126198299050428e+00 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.924092255386964e+02 2.606260561841842e+02 1.230590233456330e+02 + 3 7.500000000000002e+02 6.924092255386963e+02 -2.606260561841842e+02 -1.230590233456330e+02 + ME 2.000595763661263e+00 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000059e+02 1.410851717299414e+02 -1.722511762646131e+00 -7.366084490902573e+02 + 3 7.499999999999997e+02 -1.410851717299427e+02 1.722511762645365e+00 7.366084490902609e+02 + ME 1.671255294535188e+01 + +Event 221 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.216972323744531e+02 -5.474264157117908e+02 2.915403292662405e+02 + 3 7.499999999999997e+02 4.216972323744527e+02 5.474264157117910e+02 -2.915403292662398e+02 + ME 1.985504218638223e+00 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -5.703291911524502e+02 2.159931833959635e+02 
4.365450245345482e+02 + 3 7.500000000000001e+02 5.703291911524508e+02 -2.159931833959635e+02 -4.365450245345479e+02 + ME 2.115365223704068e+00 + +Event 223 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.653699780465835e+02 5.516351728100433e+02 -2.040328886521882e+02 + 3 7.499999999999999e+02 4.653699780465836e+02 -5.516351728100433e+02 2.040328886521882e+02 + ME 1.987944959130031e+00 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.340910046878815e+02 -5.773211279348985e+02 4.176143158037027e+02 + 3 7.499999999999998e+02 2.340910046878812e+02 5.773211279348985e+02 -4.176143158037027e+02 + ME 2.079067375569363e+00 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.759434290522928e+02 6.827868599275704e+02 2.556286636579899e+02 + 3 7.500000000000007e+02 -1.759434290522934e+02 -6.827868599275705e+02 -2.556286636579902e+02 + ME 1.983533512495551e+00 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.045452037729306e+02 -4.263792434151545e+02 1.233930199803324e+02 + 3 7.500000000000003e+02 6.045452037729307e+02 4.263792434151545e+02 -1.233930199803322e+02 + ME 2.000544909078705e+00 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.413532699307888e+02 4.584643726767907e+02 -2.434071817373498e+02 + 3 7.499999999999997e+02 -5.413532699307889e+02 -4.584643726767907e+02 2.434071817373500e+02 + ME 1.983992565736947e+00 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.451497175256272e+01 -1.575609542562009e+02 -7.331192681200635e+02 + 3 7.499999999999998e+02 -1.451497175256276e+01 1.575609542562010e+02 7.331192681200635e+02 + ME 1.516217829504346e+01 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.976075277594232e+02 5.994840638922487e+02 2.121959259935821e+02 + 3 7.500000000000007e+02 3.976075277594233e+02 -5.994840638922487e+02 -2.121959259935820e+02 + ME 1.986890198024315e+00 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.454994669319925e+02 6.424027082758220e+02 -1.744960708715188e+02 + 3 7.500000000000001e+02 -3.454994669319923e+02 -6.424027082758221e+02 1.744960708715187e+02 + ME 1.992352423518112e+00 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.709700188990096e+01 -9.588670170606436e+01 -7.416506406733005e+02 + 3 7.500000000000001e+02 -5.709700188990106e+01 9.588670170606436e+01 7.416506406733004e+02 + ME 1.956832032390851e+01 + +Event 232 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -6.095841334961023e+02 1.049630507684877e+02 4.241343444749285e+02 + 3 7.499999999999999e+02 6.095841334961023e+02 -1.049630507684882e+02 -4.241343444749288e+02 + ME 2.090570503289665e+00 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.540619742696694e+02 6.247200054612367e+02 2.164833322747417e+02 + 3 7.500000000000000e+02 -3.540619742696694e+02 -6.247200054612367e+02 -2.164833322747417e+02 + ME 1.986376590203003e+00 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.021981849113407e+02 4.737027333003317e+02 -2.930916299308989e+02 + 3 7.500000000000007e+02 -5.021981849113405e+02 -4.737027333003314e+02 2.930916299308989e+02 + ME 1.985728923505389e+00 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.462818585976497e+02 -2.802589959482593e+02 6.506152021861775e+02 + 3 7.499999999999997e+02 2.462818585976497e+02 2.802589959482593e+02 -6.506152021861776e+02 + ME 4.575826699951948e+00 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -7.345429587235462e+02 -4.314359541629803e+01 -1.452076856237854e+02 + 3 7.500000000000018e+02 7.345429587235461e+02 
4.314359541629804e+01 1.452076856237852e+02 + ME 1.997103115301681e+00 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -2.154628275024254e+02 5.951093556116026e+02 -4.023936192687488e+02 + 3 7.499999999999998e+02 2.154628275024254e+02 -5.951093556116025e+02 4.023936192687490e+02 + ME 2.055826393864322e+00 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.866932570143744e+01 -7.232769026327444e+02 -1.895479684399756e+02 + 3 7.499999999999997e+02 5.866932570143744e+01 7.232769026327443e+02 1.895479684399756e+02 + ME 1.990015734098380e+00 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.560694650821692e+02 1.084032761720732e+02 -3.468740213708119e+02 + 3 7.499999999999998e+02 6.560694650821692e+02 -1.084032761720729e+02 3.468740213708119e+02 + ME 2.003985174864813e+00 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.455880632469494e+02 -5.785873576156609e+02 1.708448053034081e+02 + 3 7.499999999999998e+02 4.455880632469495e+02 5.785873576156611e+02 -1.708448053034081e+02 + ME 1.992937128503243e+00 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 
2.940553859063193e+02 5.948775697686276e+02 3.495026566503573e+02 + 3 7.499999999999999e+02 -2.940553859063192e+02 -5.948775697686275e+02 -3.495026566503571e+02 + ME 2.005528586269665e+00 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.255013042950073e+02 3.893831803342041e+02 1.401030235150752e+02 + 3 7.500000000000005e+02 6.255013042950073e+02 -3.893831803342039e+02 -1.401030235150752e+02 + ME 1.997925520851328e+00 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.768430196426253e+02 -8.702513188290250e+01 4.713584168234838e+02 + 3 7.500000000000002e+02 5.768430196426256e+02 8.702513188290267e+01 -4.713584168234838e+02 + ME 2.209730179651860e+00 + +Event 244 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.570848048302291e+02 6.678520603231962e+02 -3.029818305006274e+02 + 3 7.499999999999999e+02 1.570848048302291e+02 -6.678520603231962e+02 3.029818305006275e+02 + ME 1.987492291155503e+00 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999959e+02 -3.619595558326033e+02 -3.635357995317544e+02 5.471078526216538e+02 + 3 7.500000000000020e+02 3.619595558326021e+02 3.635357995317536e+02 -5.471078526216538e+02 + ME 2.624749251730370e+00 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.171130913584885e+02 6.858303858260187e+01 -6.195264609652618e+02 + 3 7.500000000000000e+02 -4.171130913584882e+02 -6.858303858260200e+01 6.195264609652623e+02 + ME 3.652313954526031e+00 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.343959954156197e+02 3.550474070294902e+02 6.176567461744072e+02 + 3 7.499999999999997e+02 -2.343959954156194e+02 -3.550474070294902e+02 -6.176567461744072e+02 + ME 3.610306343463637e+00 + +Event 248 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 5.858635267134600e+02 3.455507392677969e+02 -3.160041370905555e+02 + 3 7.499999999999992e+02 -5.858635267134599e+02 -3.455507392677966e+02 3.160041370905555e+02 + ME 1.990778004173873e+00 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 7.056780166938408e+02 2.349912780195325e+02 -9.642425011268841e+01 + 3 7.500000000000002e+02 -7.056780166938408e+02 -2.349912780195324e+02 9.642425011268844e+01 + ME 2.004381269152324e+00 + +Event 250 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.004573993267173e+02 5.235762666244966e+01 7.413952993179738e+02 + 3 7.500000000000002e+02 -1.004573993267173e+02 -5.235762666244980e+01 -7.413952993179738e+02 + ME 1.940169038103634e+01 + +Event 251 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000024e+02 -6.475270246624093e+02 1.104657610733046e+02 -3.619476038908663e+02 + 3 7.499999999999998e+02 6.475270246624115e+02 -1.104657610733052e+02 3.619476038908668e+02 + ME 2.013889007952173e+00 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.030430118392510e+02 3.645899530085105e+02 4.201450873269225e+02 + 3 7.500000000000001e+02 5.030430118392510e+02 -3.645899530085106e+02 -4.201450873269225e+02 + ME 2.083415027886701e+00 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -5.256865986862222e+02 -4.330514916902604e+02 3.140382230024754e+02 + 3 7.500000000000000e+02 5.256865986862221e+02 4.330514916902599e+02 -3.140382230024749e+02 + ME 1.990205740204019e+00 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000186e+02 9.702918632485427e+01 6.245997683844753e+02 -4.036836215839905e+02 + 3 7.499999999999869e+02 -9.702918632485657e+01 -6.245997683844794e+02 4.036836215839967e+02 + ME 2.057613877988064e+00 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.376163199361313e+01 7.337003421695744e+02 1.459229071312400e+02 + 3 7.500000000000000e+02 
5.376163199361314e+01 -7.337003421695744e+02 -1.459229071312400e+02 + ME 1.996987343463174e+00 + From 717a5deaffd3f6c9fda58d57629fa817cf8b91fe Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:42:19 +0100 Subject: [PATCH 14/96] [susy2] in susy_gg_tt.sa Parameters_MSSM_SLHA2.h, temporarely(?) comment out "#error only HRDCOD=1 is supported" 'make HRDCOD=0' fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. -fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h: In function 'const mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::DependentCouplings_sv mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG(const mg5amcCpu::fptype_sv&)': Parameters_MSSM_SLHA2.h:842:33: error: 'mdl_I51x11' was not declared in this scope 842 | out.GC_51 = -( cI * G * mdl_I51x11 ); | ^~~~~~~~~~ Parameters_MSSM_SLHA2.cc: In member function 'void mg5amcCpu::Parameters_MSSM_SLHA2::setIndependentParameters(SLHAReader&)': Parameters_MSSM_SLHA2.cc:80:3: error: 'indices' was not declared in this scope 80 | indices[0] = 3; | ^~~~~~~ make[1]: *** [cudacpp_src.mk:251: Parameters_MSSM_SLHA2.o] Error 1 --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 80be641274..446fedc86c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -23,7 +23,7 @@ //========================================================================== #ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes 
(e.g. not in EFT models) for the moment (#439) -#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" +//#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" #include "read_slha.h" From df8b87a9dd01eb23a010c43cf5f21097af0f3f5a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:53:26 +0100 Subject: [PATCH 15/96] [susy2] in susyggtt.sa Parameters_MSSM_SLHA2.h, make Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11 visible in HRDCOD=0 builds 'make HRDCOD=0' now fails with ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. -fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o Parameters_MSSM_SLHA2.cc: In member function 'void mg5amcCpu::Parameters_MSSM_SLHA2::setIndependentParameters(SLHAReader&)': Parameters_MSSM_SLHA2.cc:80:3: error: 'indices' was not declared in this scope 80 | indices[0] = 3; | ^~~~~~~ --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 446fedc86c..16545bb7e2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -824,6 +824,8 @@ namespace mg5amcCpu { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_MSSM_SLHA2; +#else + const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) 
need cxsmpl conversion to cxtype in code below From 4f3b83a0a3cea27a9784075eae17a48b27c2d368 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 08:58:41 +0100 Subject: [PATCH 16/96] [susy2] in susyggtt.sa Parameters_MSSM_SLHA2.cc, declare "indices" that was previously commented out 'make HRDCOD=0' now succeeds! however it gives a warning for cuda: ccache /usr/local/cuda-12.0/bin/nvcc -Xcompiler -O3 -lineinfo -I. -I../../src -I/usr/local/cuda-12.0/include/ -DUSE_NVTX -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70 -use_fast_math -std=c++17 -ccbin /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -Xcompiler -fPIC -c -x cu CPPProcess.cc -o CPPProcess_cu.o ../../src/Parameters_MSSM_SLHA2.h(828): warning #20011-D: calling a __host__ function("mg5amcGpu::Parameters_MSSM_SLHA2::getInstance()") from a __host__ __device__ function("mg5amcGpu::Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG") is not allowed ../../src/Parameters_MSSM_SLHA2.h(828): remark: The warnings can be suppressed with "-diag-suppress " The warning is probably very relevant because computed MEs are all 0 at runtime! At runtime, gcheck.exe gives 0 MEs. At runtime, runTest.exe fails because it gives 0 MEs. [----------] 1 test from SIGMA_MSSM_SLHA2_GG_TTX_GPU/MadgraphTest [ RUN ] SIGMA_MSSM_SLHA2_GG_TTX_GPU/MadgraphTest.CompareMomentaAndME/0 INFO: Opening reference file ../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt MadgraphTest.h:310: Failure The difference between testDriver->getMatrixElement( ievt ) and referenceData[iiter].MEs[ievt] is 2.0052779755903329, which exceeds toleranceMEs * referenceData[iiter].MEs[ievt], where testDriver->getMatrixElement( ievt ) evaluates to 0, referenceData[iiter].MEs[ievt] evaluates to 2.0052779755903329, and toleranceMEs * referenceData[iiter].MEs[ievt] evaluates to 2.0052779755903328e-06. 
Google Test trace: MadgraphTest.h:289: In comparing event 0 from iteration 0 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 ref0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 ref1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 2 7.500000000000001e+02 5.849331413473453e+02 -3.138365726669761e+02 -3.490842674916367e+02 ref2 7.500000000000000e+02 5.849331413473452e+02 -3.138365726669761e+02 -3.490842674916366e+02 3 7.500000000000002e+02 -5.849331413473453e+02 3.138365726669762e+02 3.490842674916365e+02 ref3 7.500000000000001e+02 -5.849331413473452e+02 3.138365726669761e+02 3.490842674916364e+02 ME 0.000000000000000e+00 r.ME 2.005277975590333e+00 [ FAILED ] SIGMA_MSSM_SLHA2_GG_TTX_GPU/MadgraphTest.CompareMomentaAndME/0, where GetParam() = 0x2074830 (29 ms) NB: the problems above only concern CUDA. For C++-only builds, HRDCOD=0 builds now fully succeed After `CUDA_HOME=none make HRDCOD=0 -j`, both check.exe and runTest.exe succeed at runtime. This means that C++ builds are now essentially ok for both HRDCOD=0 and HRDCOD=1. The code needs some cleanup however and a backport to code generation, with cross-tests for other processes. 
--- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index 88e937627f..7de0b3ed3a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -42,7 +42,7 @@ Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) { zero = 0; // define "zero" ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + std::vector indices(2, 0); // prepare a vector for indices mdl_Wsl6 = slha.get_block_entry( "decay", 2000015, 2.699061e-01 ); mdl_Wsl5 = slha.get_block_entry( "decay", 2000013, 2.161216e-01 ); mdl_Wsl4 = slha.get_block_entry( "decay", 2000011, 2.161216e-01 ); From 3295142e3ddf1f7091a7f7d2fd2494ec413283a8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 17:02:35 +0100 Subject: [PATCH 17/96] [susy2] in susyggtt.sa, fix HRDCOD=0 cuda tests by including mdl_I51x11 in cIPD on device memory 'make HRDCOD=0' now succeeds and gives no warning. Both CPU and GPU tests succeed in the combined runTest.exe. Both check.exe and gcheck.exe succeed and give the same ME as each other. (And they give the same ME as that from check.exe in 'CUDA_HOME make HRDCOD=1' if common random are used). The other pending problem now is HRDCOD=1 builds in CUDA, where no constexpr sin/cos/atan are available. Note: all tests so far were done with FPTYPE=d. Builds with f and m precision still need to be fully tested and fixed. 'make HRDCOD=0 FPTYPE=f' builds fail: ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h: In function 'const mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::DependentCouplings_sv mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG(const mg5amcCpu::fptype_sv&, const mgOnGpu::fptype*)': Parameters_MSSM_SLHA2.h:861:66: error: the value of 'G' is not usable in a constant expression 861 | constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); | ^ Parameters_MSSM_SLHA2.h:857:23: note: 'G' was not initialized with a constant expression 857 | const fptype& G = G_sv[i]; | ^ --- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 18 +++++++++--------- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 10 ++++++---- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 909f063728..0651b57663 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -76,14 +76,14 @@ namespace mg5amcCpu // However, physics parameters are user-defined through card files: use CUDA constant memory instead (issue #39) // [NB if hardcoded parameters are used, it's better to define them here to avoid silent shadowing (issue #263)] #ifdef MGONGPU_HARDCODE_PARAM - __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; + __device__ const fptype cIPD[3] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT, (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; __device__ const fptype* cIPC = nullptr; // 
unused as nicoup=0 #else #ifdef MGONGPUCPP_GPUIMPL - __device__ __constant__ fptype cIPD[2]; + __device__ __constant__ fptype cIPD[3]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else - static fptype cIPD[2]; + static fptype cIPD[3]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif #endif @@ -502,16 +502,16 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory - const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; + const fptype tIPD[3] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT, (fptype)m_pars->mdl_I51x11 }; //const cxtype tIPC[0] = { ... }; // nicoup=0 #ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPD, tIPD, 3 * sizeof( fptype ) ); //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else - memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); + memcpy( cIPD, tIPD, 3 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -622,7 +622,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, cIPD ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -632,7 +632,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - 
G2COUP( gs, couplings ); + G2COUP( gs, couplings, cIPD ); } #endif } diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 16545bb7e2..5a15b48e0c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -820,12 +820,13 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* cIPD ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_MSSM_SLHA2; #else - const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds + //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds + const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -896,12 +897,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* cIPD ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, cIPD ); fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); cxtype_sv_ref 
GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); From 94408b8de8a4ac160cf3af7a944e5e815f61f50f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 17:29:38 +0100 Subject: [PATCH 18/96] [susy2] in susyggtt.sa, test an alternative approach to fix HRDCOD=0 cuda tests (will be reverted, committed for the record) (This also works but is an overkill, moving a lot of code from Parameters_MSSM_SLHA2.h to CPPProcess.cc. I developed this because I had introduced a silly bug in the previous simpler approach, which I then fixed.) --- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 108 +++++++++++++++++- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 11 +- 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 0651b57663..84c587ffdf 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -101,6 +101,110 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- + namespace Parameters_MSSM_SLHA2_dependentCouplings + { + constexpr size_t idcoup_GC_6 = 0; + constexpr size_t idcoup_GC_51 = 1; + struct DependentCouplings_sv + { + cxtype_sv GC_6; + cxtype_sv GC_51; + }; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#ifdef MGONGPUCPP_GPUIMPL +#pragma nv_diagnostic push +#pragma nv_diag_suppress 177 // e.g. 
<> +#endif + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + { +#ifdef MGONGPU_HARDCODE_PARAM + using namespace Parameters_MSSM_SLHA2; +#else + //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds + const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds +#endif + // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: + // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below + const cxtype cI( 0., 1. ); + DependentCouplings_sv out; + // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) +#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) + { + const fptype_sv& G = G_sv; + // Model parameters dependent on aS + //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + out.GC_6 = -G; + out.GC_51 = -( cI * G * mdl_I51x11 ); + } +#else + // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) + // Use an explicit loop to avoid <> + // Problems may come e.g. in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) + fptype_v GC_6r_v; + fptype_v GC_6i_v; + fptype_v GC_51r_v; + fptype_v GC_51i_v; + for( int i = 0; i < neppV; i++ ) + { + const fptype& G = G_sv[i]; + // Model parameters dependent on aS + //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); + //const fptype G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); + constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + // Model couplings dependent on aS + const cxtype GC_6 = -G; + const cxtype GC_51 = -( cI * G * mdl_I51x11 ); + GC_6r_v[i] = cxreal( GC_6 ); + GC_6i_v[i] = cximag( GC_6 ); + GC_51r_v[i] = cxreal( GC_51 ); + GC_51i_v[i] = cximag( GC_51 ); + } + out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); + out.GC_51 = cxtype_v( GC_51r_v, GC_51i_v ); +#endif + // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) + return out; + } +#ifdef MGONGPUCPP_GPUIMPL +#pragma GCC diagnostic pop +#pragma nv_diagnostic pop +#endif + } + + //========================================================================== + +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#endif + // Compute the output couplings (e.g. gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_MSSM_SLHA2_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); + fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); + cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); + cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s ); + GC_6s_sv = couplings_sv.GC_6; + GC_51s_sv = couplings_sv.GC_51; + mgDebug( 1, __FUNCTION__ ); + return; + } +#pragma GCC diagnostic pop + + //-------------------------------------------------------------------------- + // Evaluate |M|^2 for each subprocess // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums 
over helicities) @@ -622,7 +726,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, cIPD ); + G2COUP( allgs, allcouplings ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -632,7 +736,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, cIPD ); + G2COUP( gs, couplings ); } #endif } diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 5a15b48e0c..dc7ea766d1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -806,6 +806,11 @@ namespace mg5amcCpu namespace Parameters_MSSM_SLHA2_dependentCouplings { constexpr size_t ndcoup = 2; // #couplings that vary event by event because they depend on the running alphas QCD + } + + /* + namespace Parameters_MSSM_SLHA2_dependentCouplings + { constexpr size_t idcoup_GC_6 = 0; constexpr size_t idcoup_GC_51 = 1; struct DependentCouplings_sv @@ -878,7 +883,7 @@ namespace mg5amcCpu #pragma nv_diagnostic pop #endif } - + */ //========================================================================== namespace Parameters_MSSM_SLHA2_independentCouplings @@ -888,7 +893,7 @@ namespace mg5amcCpu } //========================================================================== - + /* #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. 
<> @@ -914,7 +919,7 @@ namespace mg5amcCpu return; } #pragma GCC diagnostic pop - + */ } // end namespace mg5amcGpu/mg5amcCpu //========================================================================== From e4bf343fb22e5191d5ebf9682e8f90dfc8eb3d27 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 18:28:27 +0100 Subject: [PATCH 19/96] [susy2] in susyggtt.sa, revert the previous commit with an alternative approach This reverts commit 94408b8de8a4ac160cf3af7a944e5e815f61f50f. --- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 108 +----------------- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 11 +- 2 files changed, 5 insertions(+), 114 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 84c587ffdf..0651b57663 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -101,110 +101,6 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- - namespace Parameters_MSSM_SLHA2_dependentCouplings - { - constexpr size_t idcoup_GC_6 = 0; - constexpr size_t idcoup_GC_51 = 1; - struct DependentCouplings_sv - { - cxtype_sv GC_6; - cxtype_sv GC_51; - }; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL -#pragma nv_diagnostic push -#pragma nv_diag_suppress 177 // e.g. 
<> -#endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) - { -#ifdef MGONGPU_HARDCODE_PARAM - using namespace Parameters_MSSM_SLHA2; -#else - //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds - const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds -#endif - // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: - // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below - const cxtype cI( 0., 1. ); - DependentCouplings_sv out; - // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) -#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT ) - { - const fptype_sv& G = G_sv; - // Model parameters dependent on aS - //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); - //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); - // Model couplings dependent on aS - out.GC_6 = -G; - out.GC_51 = -( cI * G * mdl_I51x11 ); - } -#else - // ** NB #439: special handling is necessary ONLY FOR VECTORS OF FLOATS (variable Gs are vector floats, fixed parameters are scalar doubles) - // Use an explicit loop to avoid <> - // Problems may come e.g. in EFTs from multiplying a vector float (related to aS-dependent G) by a scalar double (aS-independent parameters) - fptype_v GC_6r_v; - fptype_v GC_6i_v; - fptype_v GC_51r_v; - fptype_v GC_51i_v; - for( int i = 0; i < neppV; i++ ) - { - const fptype& G = G_sv[i]; - // Model parameters dependent on aS - //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); - //const fptype G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); - constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); - // Model couplings dependent on aS - const cxtype GC_6 = -G; - const cxtype GC_51 = -( cI * G * mdl_I51x11 ); - GC_6r_v[i] = cxreal( GC_6 ); - GC_6i_v[i] = cximag( GC_6 ); - GC_51r_v[i] = cxreal( GC_51 ); - GC_51i_v[i] = cximag( GC_51 ); - } - out.GC_6 = cxtype_v( GC_6r_v, GC_6i_v ); - out.GC_51 = cxtype_v( GC_51r_v, GC_51i_v ); -#endif - // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) - return out; - } -#ifdef MGONGPUCPP_GPUIMPL -#pragma GCC diagnostic pop -#pragma nv_diagnostic pop -#endif - } - - //========================================================================== - -#pragma GCC diagnostic push -#ifndef __clang__ -#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> -#endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_MSSM_SLHA2_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); - fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); - fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); - cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); - cxtype_sv_ref GC_51s_sv = C_ACCESS::kernelAccess( GC_51s ); - GC_6s_sv = couplings_sv.GC_6; - GC_51s_sv = couplings_sv.GC_51; - mgDebug( 1, __FUNCTION__ ); - return; - } -#pragma GCC diagnostic pop - - //-------------------------------------------------------------------------- - // Evaluate |M|^2 for each subprocess // NB: calculate_wavefunctions ADDS |M|^2 for a given ihel to the running sum of |M|^2 over helicities for the given event(s) // (similarly, it also ADDS the numerator and denominator for a given ihel to their running sums 
over helicities) @@ -726,7 +622,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, cIPD ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -736,7 +632,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, cIPD ); } #endif } diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index dc7ea766d1..5a15b48e0c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -806,11 +806,6 @@ namespace mg5amcCpu namespace Parameters_MSSM_SLHA2_dependentCouplings { constexpr size_t ndcoup = 2; // #couplings that vary event by event because they depend on the running alphas QCD - } - - /* - namespace Parameters_MSSM_SLHA2_dependentCouplings - { constexpr size_t idcoup_GC_6 = 0; constexpr size_t idcoup_GC_51 = 1; struct DependentCouplings_sv @@ -883,7 +878,7 @@ namespace mg5amcCpu #pragma nv_diagnostic pop #endif } - */ + //========================================================================== namespace Parameters_MSSM_SLHA2_independentCouplings @@ -893,7 +888,7 @@ namespace mg5amcCpu } //========================================================================== - /* + #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. 
<> @@ -919,7 +914,7 @@ namespace mg5amcCpu return; } #pragma GCC diagnostic pop - */ + } // end namespace mg5amcGpu/mg5amcCpu //========================================================================== From ac1843c7bbfc230c3b8dd6fcd939ff73ecd23995 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 18:32:36 +0100 Subject: [PATCH 20/96] [susy2] in susyggtt.sa Parameters.h, change mdl_G__exp__2 from constexpr to const also for FPTYPE=f 'make HRDCOD=0 FPTYPE=f' now fails with: ccache /cvmfs/sft.cern.ch/lcg/releases/gcc/12.1.0-57c96/x86_64-centos9/bin/g++ -O3 -std=c++17 -I. -fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT -fPIC -c Parameters_MSSM_SLHA2.cc -o Parameters_MSSM_SLHA2.o In file included from Parameters_MSSM_SLHA2.cc:15: Parameters_MSSM_SLHA2.h: In function 'const mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::DependentCouplings_sv mg5amcCpu::Parameters_MSSM_SLHA2_dependentCouplings::computeDependentCouplings_fromG(const mg5amcCpu::fptype_sv&, const mgOnGpu::fptype*)': Parameters_MSSM_SLHA2.h:861:49: error: cannot convert 'mgOnGpu::fptype' {aka 'float'} to 'const mg5amcCpu::fptype_sv' {aka 'const __vector(8) float'} in initialization 861 | const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); | ~~~~~~~~^~~~~~~~~ | | | mgOnGpu::fptype {aka float} --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 5a15b48e0c..62ae686210 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -858,7 +858,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); - constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS const cxtype GC_6 = -G; const cxtype GC_51 = -( cI * G * mdl_I51x11 ); From 94f61a63ded358dc42433ec041182637e794b8fa Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 14 Feb 2024 18:35:56 +0100 Subject: [PATCH 21/96] [susy2] in susyggtt.sa Parameters.h, use fptype instead of fptype_sv for mdl_G__exp__2 in FPTYPE=f 'make HRDCOD=0 FPTYPE=f -j' builds now succeed and the tests succeed too. 'make HRDCOD=0 FPTYPE=m -j' builds now also succeed and the tests succeed too. 'CUDA_HOME=none make HRDCOD=1' builds and tests also succeed with FPTYPE=d,f,m The only pending problem are CUDA HRDCOD=1 builds because of missing constexpr sin/cos/atan (#627) This is probably ready to be backported to CODEGEN and eventually merged --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 62ae686210..0702582a06 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -858,7 +858,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS const cxtype GC_6 = -G; const cxtype GC_51 = -( cI * G * mdl_I51x11 ); From c4f046531d2f675a9b5b6c0898c445940ea9b116 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 11:31:44 +0100 Subject: [PATCH 22/96] [susy2] in CODEGEN (backport from susyggtt.sa) add constexpr to cxsmmpl::conj function (Question: this is now valid code? 
In the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") --- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); From a7bf616b78f96070211d975076825c5877cc33df Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 11:46:32 +0100 Subject: [PATCH 23/96] [susy2] in CODEGEN, backport susy_gg_tt.sa Parameters_MSSM_SLHA2.h: comment out "#error only HRDCOD=1 is supported" and add a comment about non-SM physics processes --- .../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index de4d28ad16..b75e8a3eaf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -893,7 +893,7 @@ def super_generate_parameters_class_files(self): replace_dict['eftspecial1'] 
= ' // Begin SM implementation - no special handling of vectors of floats as in EFT (#439)' replace_dict['eftspecial2'] = ' // End SM implementation - no special handling of vectors of floats as in EFT (#439)' else: - replace_dict['efterror'] = '\n#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1"' + replace_dict['efterror'] = '\n// WARNING! Support for non-SM physics processes is still limited (see PR #625)\n//#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1"' replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else From a28025ef49e93d2f7bda3b792d93e238576433a8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 12:27:04 +0100 Subject: [PATCH 24/96] [susy2] in susyggtt.sa Parameters.h, use cxtype[_sv] for mdl_G__exp__2 instead of fptype[_sv], to ease backport to code generation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Note, this implies using cxsml which is a class and not a type, hence a different warning is issues and must be protected against "warning: variable ‘mdl_G__exp__2’ set but not used [-Wunused-but-set-variable]" --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 0702582a06..c54b16fe5c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -814,8 +814,9 @@ namespace mg5amcCpu cxtype_sv GC_51; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // 
e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> @@ -839,7 +840,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); + const cxtype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS out.GC_6 = -G; out.GC_51 = -( cI * G * mdl_I51x11 ); @@ -858,7 +859,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const fptype mdl_G__exp__2 = ( ( G ) * ( G ) ); + const cxtype mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS const cxtype GC_6 = -G; const cxtype GC_51 = -( cI * G * mdl_I51x11 ); From 42141b35322318d32a3e66d0dcd7ac6ddb443a49 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 14:15:41 +0100 Subject: [PATCH 25/96] [susy2] in CODEGEN, add reference log file for susy_gg_tt --- .../dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt | 3584 +++++++++++++++++ 1 file changed, 3584 insertions(+) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt new file mode 100644 index 0000000000..fe68e0d3cb --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttx.txt @@ -0,0 +1,3584 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.849331413473452e+02 -3.138365726669761e+02 -3.490842674916366e+02 + 3 7.500000000000001e+02 -5.849331413473452e+02 3.138365726669761e+02 3.490842674916364e+02 + ME 2.005277975590333e+00 + +Event 1 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.810632950825978e+01 -7.201507372976420e+02 -2.038840274050557e+02 + 3 7.499999999999995e+02 -4.810632950825982e+01 7.201507372976420e+02 2.038840274050556e+02 + ME 1.987965040619574e+00 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -6.648646621266247e+02 -9.844173672211535e+01 -3.328125681616957e+02 + 3 7.500000000000001e+02 6.648646621266247e+02 9.844173672211555e+01 3.328125681616955e+02 + ME 1.996913845057977e+00 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.840443703260547e+02 2.880181894591821e+02 -6.315570585677355e+02 + 3 7.500000000000003e+02 -2.840443703260547e+02 -2.880181894591822e+02 6.315570585677355e+02 + ME 3.954303259991462e+00 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.068110730975250e+02 -7.417834499166065e+02 2.913259503670260e+01 + 3 7.500000000000009e+02 -1.068110730975250e+02 7.417834499166063e+02 -2.913259503670238e+01 + 
ME 2.010443642097316e+00 + +Event 5 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.488894260747408e+02 3.183002578419756e+01 2.552404693662126e+01 + 3 7.500000000000002e+02 -7.488894260747409e+02 -3.183002578419794e+01 -2.552404693662112e+01 + ME 2.010593307151333e+00 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.342046371107455e+02 6.997714234797991e+02 2.341133705259679e+02 + 3 7.500000000000008e+02 -1.342046371107465e+02 -6.997714234797999e+02 -2.341133705259673e+02 + ME 1.984627685022313e+00 + +Event 7 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 7.437674050373496e+02 -9.311360962031367e+01 -2.529630224920391e+01 + 3 7.499999999999999e+02 -7.437674050373496e+02 9.311360962031355e+01 2.529630224920380e+01 + ME 2.010602101284343e+00 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.195549862822020e+02 4.141972083174200e+02 3.478552699778412e+02 + 3 7.500000000000000e+02 -5.195549862822019e+02 -4.141972083174200e+02 -3.478552699778412e+02 + ME 2.004552700163305e+00 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.542195584407184e+02 -3.922814044239862e+02 -3.185215546629776e+02 + 3 
7.500000000000001e+02 5.542195584407184e+02 3.922814044239862e+02 3.185215546629777e+02 + ME 1.991552846979707e+00 + +Event 10 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.763918745729663e+01 4.519082493535349e+02 -5.983040976226634e+02 + 3 7.500000000000000e+02 1.763918745729668e+01 -4.519082493535349e+02 5.983040976226634e+02 + ME 3.238239815179103e+00 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.439620635476698e+02 -5.926551785630562e+02 3.895412056149584e+02 + 3 7.500000000000001e+02 -2.439620635476698e+02 5.926551785630559e+02 -3.895412056149584e+02 + ME 2.039685380250985e+00 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.246322345811117e+02 -1.588382746342976e+02 -6.977182538090112e+02 + 3 7.499999999999999e+02 -2.246322345811119e+02 1.588382746342976e+02 6.977182538090112e+02 + ME 7.660154481284926e+00 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 1.323144312359150e+02 1.592517083468643e+02 -7.208548984888159e+02 + 3 7.499999999999998e+02 -1.323144312359151e+02 -1.592517083468644e+02 7.208548984888159e+02 + ME 1.137910388792190e+01 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000002e+02 -7.006663333078419e+02 -2.674479229498733e+02 -6.188527994805768e+00 + 3 7.500000000000000e+02 7.006663333078420e+02 2.674479229498733e+02 6.188527994805669e+00 + ME 2.011060943142014e+00 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.958624135651978e+02 -4.159237215561344e+02 3.789827498187641e+02 + 3 7.499999999999994e+02 4.958624135651975e+02 4.159237215561346e+02 -3.789827498187632e+02 + ME 2.028504154582342e+00 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.469644380816109e+02 -1.127460289287336e+02 6.991401142001760e+02 + 3 7.499999999999998e+02 2.469644380816109e+02 1.127460289287336e+02 -6.991401142001760e+02 + ME 7.819200457562640e+00 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.330123448568297e+02 -1.270424905118157e+02 -9.514782126800218e+01 + 3 7.500000000000003e+02 7.330123448568297e+02 1.270424905118154e+02 9.514782126800226e+01 + ME 2.004547006017963e+00 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.632088213454307e+02 -7.202207510227294e+02 -1.309387277748307e+02 + 3 7.499999999999999e+02 1.632088213454307e+02 7.202207510227292e+02 1.309387277748307e+02 + ME 1.999378789386389e+00 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -6.437046519440436e+02 3.615612227659706e+02 1.319765254034735e+02 + 3 7.499999999999990e+02 6.437046519440436e+02 -3.615612227659711e+02 -1.319765254034727e+02 + ME 1.999216047420012e+00 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.414032790372056e+02 6.619737876650544e+01 6.645010971451344e+02 + 3 7.500000000000000e+02 -3.414032790372056e+02 -6.619737876650579e+01 -6.645010971451345e+02 + ME 5.185737809672471e+00 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.873372127826304e+02 1.330214527069503e+02 -2.690220233436987e+02 + 3 7.500000000000001e+02 6.873372127826304e+02 -1.330214527069503e+02 2.690220233436988e+02 + ME 1.983618346512617e+00 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.102099953078523e+02 -1.032640414901363e+02 4.236511529094595e+02 + 3 7.500000000000001e+02 -6.102099953078524e+02 1.032640414901364e+02 -4.236511529094595e+02 + ME 2.089683691123227e+00 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.344895803220363e+02 4.175443017228515e+02 3.201212967391419e+02 + 3 7.499999999999998e+02 5.344895803220362e+02 -4.175443017228516e+02 -3.201212967391422e+02 + ME 1.992070352051261e+00 + +Event 24 Batch 
0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.418425161536912e+02 -5.777728680811457e+02 -4.566828369015798e+02 + 3 7.499999999999984e+02 -1.418425161536913e+02 5.777728680811457e+02 4.566828369015796e+02 + ME 2.164935885748348e+00 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.716447327703473e+02 -2.655667876457525e+02 -5.948566834991209e+02 + 3 7.500000000000003e+02 3.716447327703468e+02 2.655667876457529e+02 5.948566834991209e+02 + ME 3.182266218087864e+00 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.982684047175275e+02 2.520499043566510e+02 1.067336904966802e+02 + 3 7.499999999999982e+02 -6.982684047175262e+02 -2.520499043566514e+02 -1.067336904966819e+02 + ME 2.002985852932905e+00 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.841422120170265e+02 -2.875466445556266e+02 3.722832300072826e+02 + 3 7.500000000000002e+02 5.841422120170264e+02 2.875466445556268e+02 -3.722832300072830e+02 + ME 2.022282544333713e+00 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.475651347914464e+02 9.856189468224678e-02 -5.125157689733829e+02 + 3 7.499999999999998e+02 
5.475651347914463e+02 -9.856189468233521e-02 5.125157689733828e+02 + ME 2.387846999289178e+00 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.650224466071159e+02 8.783714288524544e+01 -5.818408377565889e+02 + 3 7.500000000000002e+02 4.650224466071161e+02 -8.783714288524529e+01 5.818408377565888e+02 + ME 2.993624880577912e+00 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.782526290173227e+02 -4.370648563926671e+02 -5.828381521098888e+02 + 3 7.499999999999999e+02 -1.782526290173226e+02 4.370648563926671e+02 5.828381521098888e+02 + ME 3.006916608849429e+00 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.241445908528016e+01 -6.984630617313881e+02 -2.681456243827463e+02 + 3 7.500000000000001e+02 -5.241445908528017e+01 6.984630617313881e+02 2.681456243827463e+02 + ME 1.983591630570634e+00 + +Event 32 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.502669185499846e+02 -4.340701284735799e+02 -4.139357747603254e+02 + 3 7.500000000000022e+02 -4.502669185499836e+02 4.340701284735788e+02 4.139357747603258e+02 + ME 2.073003851963782e+00 + +Event 33 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999994e+02 3.654307823639496e+02 -1.558862065836156e+02 -6.361287871947329e+02 + 3 7.499999999999986e+02 -3.654307823639502e+02 1.558862065836151e+02 6.361287871947329e+02 + ME 4.085426045280015e+00 + +Event 34 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.482796593496792e+02 5.744589614092289e+02 4.588137359318844e+02 + 3 7.499999999999999e+02 1.482796593496794e+02 -5.744589614092289e+02 -4.588137359318844e+02 + ME 2.170948319867109e+00 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -5.948173619449914e+02 -4.238736069615458e+02 1.703627636844680e+02 + 3 7.499999999999997e+02 5.948173619449915e+02 4.238736069615456e+02 -1.703627636844681e+02 + ME 1.993014678181581e+00 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.826165317041466e+02 4.475925182298392e+02 -5.313086048919125e+02 + 3 7.500000000000006e+02 -2.826165317041466e+02 -4.475925182298394e+02 5.313086048919129e+02 + ME 2.503977667659708e+00 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.291270169087662e+02 -1.025561575994625e+02 5.215427446678407e+02 + 3 7.499999999999999e+02 -5.291270169087661e+02 1.025561575994623e+02 -5.215427446678407e+02 + ME 2.440305740927076e+00 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.783993127041845e+02 -6.682289316640183e+02 2.900754731338715e+02 + 3 7.499999999999999e+02 1.783993127041844e+02 6.682289316640183e+02 -2.900754731338715e+02 + ME 1.985304274456458e+00 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 7.035220480385169e+02 -2.585746933707227e+02 2.637908019484471e+01 + 3 7.499999999999966e+02 -7.035220480385183e+02 2.585746933707188e+02 -2.637908019484133e+01 + ME 2.010559599140099e+00 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.045990218130194e+00 -7.391865839862932e+02 -1.268940501329283e+02 + 3 7.500000000000000e+02 -1.045990218130196e+00 7.391865839862933e+02 1.268940501329283e+02 + ME 2.000007807605100e+00 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 4.516393832142919e+02 -5.982487717061383e+02 2.490531433069624e+01 + 3 7.499999999999998e+02 -4.516393832142917e+02 5.982487717061382e+02 -2.490531433069631e+01 + ME 2.010617017961215e+00 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.404063782711548e+02 -2.677733538887419e+02 -6.123078707476250e+02 + 3 7.499999999999999e+02 3.404063782711548e+02 2.677733538887418e+02 6.123078707476250e+02 + ME 3.496555157380759e+00 + +Event 43 Batch 
0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.431989925840304e+02 -4.375135005185252e+02 4.179193580543111e+02 + 3 7.500000000000002e+02 -4.431989925840304e+02 4.375135005185252e+02 -4.179193580543111e+02 + ME 2.079583697162946e+00 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.639601113951325e+02 7.019963091374882e+02 -5.122650643294619e+00 + 3 7.499999999999998e+02 -2.639601113951325e+02 -7.019963091374882e+02 5.122650643294828e+00 + ME 2.011070173201013e+00 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.618223353367828e+02 2.920485830812342e+02 5.137389957014393e+02 + 3 7.499999999999999e+02 4.618223353367827e+02 -2.920485830812343e+02 -5.137389957014392e+02 + ME 2.394622186242569e+00 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000191e+02 -7.051112468437821e+02 2.555740963349632e+02 1.041966542134262e-01 + 3 7.499999999999935e+02 7.051112468437943e+02 -2.555740963349714e+02 -1.041966542091747e-01 + ME 2.011090259550378e+00 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.814786796723018e+02 -1.264994422665163e+02 -6.332234266526609e+02 + 3 7.500000000000001e+02 
3.814786796723018e+02 1.264994422665164e+02 6.332234266526608e+02 + ME 4.000948864693926e+00 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.223718392138900e+02 -9.075826002414982e+01 5.304626281216113e+02 + 3 7.500000000000000e+02 5.223718392138899e+02 9.075826002414982e+01 -5.304626281216113e+02 + ME 2.498154652615908e+00 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999975e+02 2.512062929704643e+02 5.497568647487620e+02 -4.440301656798204e+02 + 3 7.500000000000001e+02 -2.512062929704639e+02 -5.497568647487620e+02 4.440301656798205e+02 + ME 2.132341642526155e+00 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.081599294091975e+02 -4.985692845883419e+02 3.838412959671587e+02 + 3 7.500000000000002e+02 4.081599294091976e+02 4.985692845883420e+02 -3.838412959671588e+02 + ME 2.033431466872660e+00 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999981e+02 6.013537967828100e+02 1.484120711712781e+02 4.229036157631559e+02 + 3 7.499999999999998e+02 -6.013537967828098e+02 -1.484120711712782e+02 -4.229036157631558e+02 + ME 2.088322753728032e+00 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
3.788389248314356e+01 -6.412729991041272e+02 3.870836489834254e+02 + 3 7.500000000000001e+02 -3.788389248314355e+01 6.412729991041272e+02 -3.870836489834253e+02 + ME 2.036924470326421e+00 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.721956772389075e+02 -5.973666994060347e+02 2.590818486220348e+02 + 3 7.499999999999999e+02 3.721956772389076e+02 5.973666994060347e+02 -2.590818486220348e+02 + ME 1.983491962850991e+00 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.279511131417045e+02 4.100901816942154e+02 1.855365759053465e+00 + 3 7.500000000000002e+02 -6.279511131417045e+02 -4.100901816942154e+02 -1.855365759053460e+00 + ME 2.011087631467305e+00 + +Event 55 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 2.906696961973025e+02 5.783283149374524e+02 -3.788766129680948e+02 + 3 7.500000000000013e+02 -2.906696961973020e+02 -5.783283149374529e+02 3.788766129680953e+02 + ME 2.028400504671404e+00 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.052921399041344e+02 -6.285366974145302e+02 -3.539728199972655e+02 + 3 7.500000000000000e+02 -2.052921399041344e+02 6.285366974145302e+02 3.539728199972655e+02 + ME 2.008326264666069e+00 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.566747271383028e+02 -2.172482942051992e+01 -5.021367144037026e+02 + 3 7.500000000000002e+02 -5.566747271383028e+02 2.172482942051981e+01 5.021367144037026e+02 + ME 2.334236035577041e+00 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -1.111752699385218e+02 -4.460916785360812e+02 -5.925725893888097e+02 + 3 7.500000000000000e+02 1.111752699385215e+02 4.460916785360812e+02 5.925725893888097e+02 + ME 3.146652063342941e+00 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 8.715681773052760e+00 -1.696629445817669e+02 -7.305056619404637e+02 + 3 7.499999999999999e+02 -8.715681773052756e+00 1.696629445817669e+02 7.305056619404637e+02 + ME 1.416956033396763e+01 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 6.928032817602112e+02 2.624225493926031e+02 1.168675247986685e+02 + 3 7.500000000000007e+02 -6.928032817602116e+02 -2.624225493926037e+02 -1.168675247986700e+02 + ME 2.001525295764226e+00 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -3.549971539677352e+02 2.839011090540922e+02 -5.965544241330084e+02 + 3 7.500000000000007e+02 3.549971539677362e+02 -2.839011090540924e+02 5.965544241330070e+02 + ME 3.209489797994468e+00 + +Event 62 Batch 0 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.075091268532094e+02 5.438463972512831e+02 4.149328055225047e+02 + 3 7.500000000000003e+02 3.075091268532095e+02 -5.438463972512831e+02 -4.149328055225046e+02 + ME 2.074617910784855e+00 + +Event 63 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.476352128383519e+02 -1.524963074254458e+02 4.892244371258423e+02 + 3 7.500000000000001e+02 -5.476352128383519e+02 1.524963074254458e+02 -4.892244371258422e+02 + ME 2.276300836068042e+00 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.967977363399760e+02 1.264105709968621e+02 -6.237563017523095e+02 + 3 7.500000000000000e+02 -3.967977363399760e+02 -1.264105709968620e+02 6.237563017523095e+02 + ME 3.751976516771654e+00 + +Event 65 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.816326009293766e+02 -1.978646582155085e+02 2.423356646874607e+02 + 3 7.499999999999998e+02 -6.816326009293764e+02 1.978646582155085e+02 -2.423356646874606e+02 + ME 1.984054085169622e+00 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.665640119169115e+01 -7.284851156281566e+02 -1.745443913733847e+02 + 3 7.500000000000002e+02 
3.665640119169112e+01 7.284851156281566e+02 1.745443913733847e+02 + ME 1.992344720708242e+00 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.153305678246726e+02 1.717790906820774e+02 -1.459250586433670e+02 + 3 7.499999999999997e+02 -7.153305678246727e+02 -1.717790906820775e+02 1.459250586433670e+02 + ME 1.996986995028635e+00 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.487016905468892e+02 8.269239097845504e+01 -5.952552069277764e+02 + 3 7.500000000000000e+02 4.487016905468893e+02 -8.269239097845502e+01 5.952552069277764e+02 + ME 3.188598120955346e+00 + +Event 69 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000244e+02 4.919500575481476e+02 -2.881927178399743e+02 -4.872679942930075e+02 + 3 7.499999999999948e+02 -4.919500575481655e+02 2.881927178399841e+02 4.872679942930299e+02 + ME 2.268286827319417e+00 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.773996828232634e+02 1.950566034399255e+02 -6.180796072185036e+02 + 3 7.500000000000000e+02 3.773996828232636e+02 -1.950566034399255e+02 6.180796072185035e+02 + ME 3.619701258963251e+00 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 
5.046979384687710e+02 -3.405008111295478e+02 4.379945074147525e+02 + 3 7.499999999999985e+02 -5.046979384687706e+02 3.405008111295465e+02 -4.379945074147532e+02 + ME 2.118527835260358e+00 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.079545016377724e+02 4.542001484952560e+02 5.594257326540076e+02 + 3 7.500000000000000e+02 2.079545016377724e+02 -4.542001484952560e+02 -5.594257326540076e+02 + ME 2.737044891305738e+00 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.692745262677147e+02 2.933889353680814e+02 1.688032614990298e+02 + 3 7.500000000000000e+02 6.692745262677147e+02 -2.933889353680814e+02 -1.688032614990298e+02 + ME 1.993266062385568e+00 + +Event 74 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.336394009948918e+02 -5.214090606974566e+00 -6.119065967645178e+02 + 3 7.500000000000001e+02 -4.336394009948917e+02 5.214090606974610e+00 6.119065967645178e+02 + ME 3.488386859223064e+00 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.886320510875878e+01 -1.836844277890389e+02 -7.247724843508738e+02 + 3 7.500000000000001e+02 5.886320510875916e+01 1.836844277890389e+02 7.247724843508737e+02 + ME 1.237452317062543e+01 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.688146793241962e+02 4.830467602600368e+02 3.307243925875947e+02 + 3 7.499999999999999e+02 -4.688146793241962e+02 -4.830467602600368e+02 -3.307243925875947e+02 + ME 1.996022578666890e+00 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.339445789702437e+02 -2.102547003631318e+02 -3.411850403658548e+02 + 3 7.499999999999999e+02 6.339445789702437e+02 2.102547003631319e+02 3.411850403658547e+02 + ME 2.000891094991024e+00 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.669825029292291e+02 -6.850785338298774e+02 1.479450763464257e+02 + 3 7.500000000000002e+02 -2.669825029292290e+02 6.850785338298771e+02 -1.479450763464256e+02 + ME 1.996659436127080e+00 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.533967556727067e+02 -1.417706364842367e+02 -6.461515454681355e+02 + 3 7.499999999999999e+02 3.533967556727067e+02 1.417706364842366e+02 6.461515454681355e+02 + ME 4.411164889613540e+00 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.262373668943077e+02 -6.163135238556597e+02 -3.129452628857666e+01 + 3 7.500000000000001e+02 4.262373668943077e+02 6.163135238556597e+02 3.129452628857667e+01 + ME 2.010344710722510e+00 + +Event 81 Batch 0 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.465617073786625e+02 6.074723773897961e+01 -3.814957644509608e+01 + 3 7.499999999999998e+02 7.465617073786623e+02 -6.074723773897957e+01 3.814957644509607e+01 + ME 2.009985571987386e+00 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 7.475659550619685e+02 -4.211883263705301e+00 6.022792435124191e+01 + 3 7.499999999999999e+02 -7.475659550619689e+02 4.211883263705446e+00 -6.022792435124160e+01 + ME 2.008374203929284e+00 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.388305538878825e+02 2.707043584000302e+02 -2.848063794273738e+02 + 3 7.500000000000001e+02 -6.388305538878825e+02 -2.707043584000302e+02 2.848063794273738e+02 + ME 1.984680018426294e+00 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.782135480622982e+02 -6.945359840166777e+02 -2.199083859088391e+02 + 3 7.499999999999998e+02 1.782135480622988e+02 6.945359840166777e+02 2.199083859088391e+02 + ME 1.985988745513523e+00 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.627525077696844e+02 1.216078070177454e+02 -6.450753129512397e+02 + 3 7.499999999999999e+02 
3.627525077696847e+02 -1.216078070177455e+02 6.450753129512397e+02 + ME 4.373397884360151e+00 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.904296033356203e+02 8.239172209485156e+01 4.550873406668076e+02 + 3 7.500000000000001e+02 5.904296033356203e+02 -8.239172209485157e+01 -4.550873406668076e+02 + ME 2.160536921334943e+00 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.743433600636463e+01 -2.654341936409747e+02 6.982100680837729e+02 + 3 7.499999999999999e+02 -6.743433600636475e+01 2.654341936409747e+02 -6.982100680837726e+02 + ME 7.714450680030922e+00 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.123842692071390e+02 6.372819831147648e+02 -2.424618368923954e+02 + 3 7.500000000000001e+02 3.123842692071391e+02 -6.372819831147648e+02 2.424618368923954e+02 + ME 1.984046675304770e+00 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000013e+02 2.445149472352194e+02 2.427088730721040e+02 -6.661867932574363e+02 + 3 7.499999999999998e+02 -2.445149472352193e+02 -2.427088730721042e+02 6.661867932574366e+02 + ME 5.271748569268313e+00 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000008e+02 -6.884814144916876e+01 7.440438304666905e+02 -6.448807458320981e+01 + 3 7.500000000000000e+02 6.884814144916906e+01 -7.440438304666907e+02 6.448807458320999e+01 + ME 2.007986879860101e+00 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 4.141760802881196e+02 -5.972800904748399e+02 1.849720736747397e+02 + 3 7.499999999999998e+02 -4.141760802881197e+02 5.972800904748400e+02 -1.849720736747392e+02 + ME 1.990710544351959e+00 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 3.924404784338562e+01 6.957303155217187e+02 2.773431678857735e+02 + 3 7.500000000000002e+02 -3.924404784338608e+01 -6.957303155217187e+02 -2.773431678857725e+02 + ME 1.984035246923061e+00 + +Event 93 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.290083413418521e+02 -2.957602135933498e+02 5.394235248061855e+02 + 3 7.500000000000003e+02 4.290083413418521e+02 2.957602135933496e+02 -5.394235248061854e+02 + ME 2.563041219586656e+00 + +Event 94 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.430046970424431e+02 5.127268066666062e+02 6.897188351284382e+01 + 3 7.500000000000002e+02 5.430046970424431e+02 -5.127268066666062e+02 -6.897188351284385e+01 + ME 2.007553878294709e+00 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.670174710353043e+02 -4.266096910567015e+02 -5.938091746221820e+02 + 3 7.500000000000003e+02 -1.670174710353046e+02 4.266096910567016e+02 5.938091746221818e+02 + ME 3.165791387662161e+00 + +Event 96 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.630781801178598e+02 4.564206587749571e+02 -5.338258642940684e+02 + 3 7.499999999999999e+02 2.630781801178598e+02 -4.564206587749570e+02 5.338258642940684e+02 + ME 2.521668423905222e+00 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 3.518870680096824e+02 -5.345219674188427e+02 -3.911032571000131e+02 + 3 7.500000000000005e+02 -3.518870680096824e+02 5.345219674188430e+02 3.911032571000131e+02 + ME 2.041492391644695e+00 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.752074620001102e+02 -5.701512527722717e+02 -1.077284410990111e+02 + 3 7.500000000000003e+02 -4.752074620001106e+02 5.701512527722713e+02 1.077284410990113e+02 + ME 2.002846157195692e+00 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.725046135512981e+02 -2.442507613156968e+02 6.033919774737651e+02 + 3 7.500000000000001e+02 3.725046135512981e+02 2.442507613156967e+02 -6.033919774737650e+02 + ME 3.326050357956303e+00 + +Event 100 Batch 
0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -6.744099165829510e+02 -2.914668115726679e+02 1.507261164040269e+02 + 3 7.500000000000005e+02 6.744099165829509e+02 2.914668115726684e+02 -1.507261164040272e+02 + ME 1.996207314305127e+00 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.607743247254612e+01 -6.958012347235277e+02 2.794676330495263e+02 + 3 7.500000000000001e+02 1.607743247254616e+01 6.958012347235277e+02 -2.794676330495263e+02 + ME 1.984191496570907e+00 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.752700687038076e+02 -3.061796867633290e+02 -1.129793508844210e+02 + 3 7.500000000000001e+02 -6.752700687038074e+02 3.061796867633290e+02 1.129793508844210e+02 + ME 2.002095206274886e+00 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.709405901562356e+02 -3.521101236710536e+02 6.042430449861606e+02 + 3 7.500000000000014e+02 -2.709405901562357e+02 3.521101236710547e+02 -6.042430449861606e+02 + ME 3.341378826640739e+00 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000019e+02 -5.896850660222877e+02 -3.933704689676251e+02 2.450126467235561e+02 + 3 7.499999999999995e+02 
5.896850660222864e+02 3.933704689676245e+02 -2.450126467235561e+02 + ME 1.983906451475833e+00 + +Event 105 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.232008513548739e+02 2.751892248186205e+02 -6.183139172194097e+02 + 3 7.499999999999999e+02 3.232008513548740e+02 -2.751892248186206e+02 6.183139172194099e+02 + ME 3.624933423198033e+00 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000022e+02 -6.293500647395228e+02 3.467904123852736e+02 -2.148369285993608e+02 + 3 7.499999999999977e+02 6.293500647395228e+02 -3.467904123852738e+02 2.148369285993607e+02 + ME 1.986570256908784e+00 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -3.164803773315051e+02 6.485956071491387e+02 -2.041173906137060e+02 + 3 7.499999999999999e+02 3.164803773315048e+02 -6.485956071491385e+02 2.041173906137067e+02 + ME 1.987933572364906e+00 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.270898770881453e+02 1.543847712987803e+02 -7.228509566520220e+02 + 3 7.499999999999999e+02 -1.270898770881454e+02 -1.543847712987803e+02 7.228509566520220e+02 + ME 1.186640788044667e+01 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999999e+02 5.270964825883011e+02 4.495607724123360e+02 2.873402337840359e+02 + 3 7.499999999999999e+02 -5.270964825883012e+02 -4.495607724123360e+02 -2.873402337840358e+02 + ME 1.984961998572323e+00 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.153664733993946e+02 -2.099256997662553e+02 -8.174355824015288e+01 + 3 7.500000000000001e+02 7.153664733993946e+02 2.099256997662553e+02 8.174355824015289e+01 + ME 2.006184439353365e+00 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 -4.856864011705250e+02 4.693412062195835e+02 -3.260790576875004e+02 + 3 7.499999999999993e+02 4.856864011705258e+02 -4.693412062195836e+02 3.260790576875023e+02 + ME 1.994176128857618e+00 + +Event 112 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.064700140846772e+02 5.272494748911355e+02 1.673203994140850e+02 + 3 7.500000000000005e+02 5.064700140846775e+02 -5.272494748911357e+02 -1.673203994140850e+02 + ME 1.993505729951682e+00 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.589223488778573e+02 3.321278479133653e+02 3.738942371383856e+02 + 3 7.500000000000001e+02 -5.589223488778573e+02 -3.321278479133653e+02 -3.738942371383856e+02 + ME 2.023720122294617e+00 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.977003556727846e+01 -1.428011991695803e+02 7.345956446068556e+02 + 3 7.500000000000001e+02 4.977003556727855e+01 1.428011991695803e+02 -7.345956446068556e+02 + ME 1.578342838955713e+01 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.329003126925221e+02 7.085629881653696e+02 7.871426903610248e+01 + 3 7.499999999999999e+02 -2.329003126925221e+02 -7.085629881653693e+02 -7.871426903610259e+01 + ME 2.006526871110616e+00 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.524145310251806e+01 9.339299182317880e+01 7.440063488879913e+02 + 3 7.500000000000000e+02 1.524145310251808e+01 -9.339299182317883e+01 -7.440063488879913e+02 + ME 2.124419017838906e+01 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 8.625602426427481e+01 6.794638862165408e+02 -3.055956897694005e+02 + 3 7.500000000000019e+02 -8.625602426427798e+01 -6.794638862165372e+02 3.055956897693977e+02 + ME 1.988059503765081e+00 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.994800131986639e+02 -5.547627166697749e+02 7.251237555226915e+01 + 3 7.499999999999997e+02 4.994800131986638e+02 5.547627166697752e+02 -7.251237555226918e+01 + ME 
2.007194060980918e+00 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.696453710916101e+02 -6.412256459582578e+02 -3.500715912961300e+02 + 3 7.499999999999997e+02 1.696453710916101e+02 6.412256459582578e+02 3.500715912961301e+02 + ME 2.005872415478047e+00 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.909680497736684e+02 -4.472542030924528e+02 -1.149801807392985e+02 + 3 7.499999999999998e+02 -5.909680497736686e+02 4.472542030924529e+02 1.149801807392985e+02 + ME 2.001803340914013e+00 + +Event 121 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.403248055966552e+02 -1.878335448594353e+02 5.773496557562032e+02 + 3 7.500000000000001e+02 -4.403248055966552e+02 1.878335448594353e+02 -5.773496557562032e+02 + ME 2.935937839381182e+00 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.911542100677308e+01 4.343577103010796e+02 6.062789206800156e+02 + 3 7.499999999999997e+02 7.911542100677336e+01 -4.343577103010785e+02 -6.062789206800155e+02 + ME 3.378827638495415e+00 + +Event 123 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.897831726795230e+02 -1.483823971281492e+02 
5.482655451352789e+02 + 3 7.499999999999997e+02 -4.897831726795229e+02 1.483823971281491e+02 -5.482655451352789e+02 + ME 2.634572486292920e+00 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.072168527361445e+02 3.676168647522196e+02 -6.200153358521670e+02 + 3 7.500000000000000e+02 2.072168527361444e+02 -3.676168647522194e+02 6.200153358521670e+02 + ME 3.663499416923802e+00 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.485400265882382e+02 -5.747265047667001e+02 -4.583942683103900e+02 + 3 7.500000000000002e+02 1.485400265882382e+02 5.747265047667001e+02 4.583942683103899e+02 + ME 2.169752204214021e+00 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.787754338344337e+02 4.314693606084867e+02 5.868334815886135e+02 + 3 7.500000000000003e+02 -1.787754338344338e+02 -4.314693606084865e+02 -5.868334815886134e+02 + ME 3.062012798921335e+00 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.824698245714143e+02 -6.992613640416623e+02 -2.005948849783361e+02 + 3 7.500000000000002e+02 1.824698245714143e+02 6.992613640416619e+02 2.005948849783361e+02 + ME 1.988415763442605e+00 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.808356150498172e+02 5.501654866182587e+02 -3.387951765355025e+02 + 3 7.500000000000001e+02 -3.808356150498172e+02 -5.501654866182587e+02 3.387951765355026e+02 + ME 1.999687906503314e+00 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.909177663532490e+02 -7.106409991592089e+00 4.618069860289238e+02 + 3 7.500000000000000e+02 -5.909177663532489e+02 7.106409991592302e+00 -4.618069860289237e+02 + ME 2.179665903384003e+00 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 -6.434408032869533e+02 1.129979432354512e+02 3.683957077518649e+02 + 3 7.499999999999989e+02 6.434408032869521e+02 -1.129979432354518e+02 -3.683957077518653e+02 + ME 2.018960418239853e+00 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 6.746515082539579e+02 1.884449229635104e+02 2.680183826157030e+02 + 3 7.500000000000000e+02 -6.746515082539579e+02 -1.884449229635102e+02 -2.680183826157021e+02 + ME 1.983588012685104e+00 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.558298868453927e+02 9.921998690776338e+01 -6.980017644638849e+02 + 3 7.499999999999997e+02 2.558298868453928e+02 -9.921998690776336e+01 6.980017644638850e+02 + ME 7.691362840915397e+00 + +Event 133 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 3.135802854217638e+02 -4.216148118485033e+02 5.351713324018350e+02 + 3 7.499999999999995e+02 -3.135802854217635e+02 4.216148118485038e+02 -5.351713324018348e+02 + ME 2.531352293421454e+00 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.181187851375606e+02 2.100869264433576e+02 7.102192872171314e+02 + 3 7.499999999999999e+02 1.181187851375606e+02 -2.100869264433576e+02 -7.102192872171313e+02 + ME 9.317293972715463e+00 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.827401847882261e+02 -7.210896591418211e+02 -9.558100408709542e+01 + 3 7.500000000000001e+02 1.827401847882261e+02 7.210896591418210e+02 9.558100408709542e+01 + ME 2.004490941604272e+00 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 1.347097060534662e+02 3.165403007658666e+02 6.664499479226027e+02 + 3 7.500000000000008e+02 -1.347097060534673e+02 -3.165403007658664e+02 -6.664499479226030e+02 + ME 5.285444697457056e+00 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.709304925774791e+02 -4.143354602318728e+02 -2.546850977966301e+02 + 3 7.500000000000000e+02 -5.709304925774790e+02 
4.143354602318728e+02 2.546850977966302e+02 + ME 1.983552010334113e+00 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.230024938116915e+02 -6.078146322618757e+01 7.134952708195409e+02 + 3 7.500000000000000e+02 2.230024938116915e+02 6.078146322618757e+01 -7.134952708195409e+02 + ME 9.871519249046608e+00 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.938095139105243e+02 -4.125095274968337e+02 3.853414767395157e+02 + 3 7.499999999999999e+02 4.938095139105243e+02 4.125095274968338e+02 -3.853414767395157e+02 + ME 2.035026781271170e+00 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.150639064959597e+02 7.154069121581596e+02 -6.663682284611146e+01 + 3 7.500000000000010e+02 -2.150639064959609e+02 -7.154069121581591e+02 6.663682284611136e+01 + ME 2.007782577771487e+00 + +Event 141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.001994340787286e+02 -3.935615219448572e+02 2.176464283557658e+02 + 3 7.500000000000001e+02 -6.001994340787286e+02 3.935615219448572e+02 -2.176464283557658e+02 + ME 1.986242560158687e+00 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 
3.402076114496817e+02 6.461448067635170e+02 -1.710428887859415e+02 + 3 7.500000000000002e+02 -3.402076114496816e+02 -6.461448067635171e+02 1.710428887859415e+02 + ME 1.992905283508286e+00 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -2.483174680463646e+02 4.871846387911280e+02 -5.133123442788026e+02 + 3 7.499999999999984e+02 2.483174680463649e+02 -4.871846387911279e+02 5.133123442788022e+02 + ME 2.392247598308911e+00 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.342206172984319e+01 7.464718320769290e+02 -6.878379852869921e+01 + 3 7.500000000000003e+02 2.342206172984328e+01 -7.464718320769291e+02 6.878379852869914e+01 + ME 2.007572555109217e+00 + +Event 145 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999972e+02 6.139969952391268e+02 -3.976063262667790e+02 -1.655804914534163e+02 + 3 7.500000000000030e+02 -6.139969952391284e+02 3.976063262667795e+02 1.655804914534169e+02 + ME 1.993787627301450e+00 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -7.489741806317043e+02 -1.872912227594817e+01 3.445136754673255e+01 + 3 7.500000000000001e+02 7.489741806317041e+02 1.872912227594822e+01 -3.445136754673256e+01 + ME 2.010187867275871e+00 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.413828145683407e+02 3.157741422579704e+02 -6.654303801495521e+02 + 3 7.500000000000000e+02 1.413828145683408e+02 -3.157741422579705e+02 6.654303801495525e+02 + ME 5.232788324013259e+00 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.671561067841420e+02 2.284674470436643e+02 2.553533920314686e+02 + 3 7.500000000000001e+02 6.671561067841420e+02 -2.284674470436643e+02 -2.553533920314688e+02 + ME 1.983538597988968e+00 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.495458719419115e+02 -1.546896579009742e+02 -6.452819485673282e+02 + 3 7.499999999999998e+02 -3.495458719419115e+02 1.546896579009742e+02 6.452819485673284e+02 + ME 4.380593272658192e+00 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.603068721757451e+01 7.476532040007083e+02 1.937133545739223e+01 + 3 7.499999999999994e+02 5.603068721757452e+01 -7.476532040007069e+02 -1.937133545739284e+01 + ME 2.010803533079019e+00 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -2.349239539523203e+01 2.915649777816018e+02 -6.906069584718988e+02 + 3 7.499999999999984e+02 2.349239539523064e+01 -2.915649777816013e+02 6.906069584718991e+02 + ME 6.951008749064900e+00 + +Event 152 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 1.325883219266814e+02 7.097564553439875e+02 2.028943345345159e+02 + 3 7.500000000000000e+02 -1.325883219266814e+02 -7.097564553439880e+02 -2.028943345345148e+02 + ME 1.988099263967034e+00 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.538006601590899e+02 -6.984195271964692e+02 -1.014661960136019e+02 + 3 7.500000000000003e+02 2.538006601590900e+02 6.984195271964692e+02 1.014661960136019e+02 + ME 2.003711111593866e+00 + +Event 154 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.323172710781250e+02 -6.860984228486728e+02 2.725088878165297e+02 + 3 7.500000000000000e+02 1.323172710781251e+02 6.860984228486726e+02 -2.725088878165297e+02 + ME 1.983756374901854e+00 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.452704712456864e+02 -4.996070210616754e+02 1.247915770129769e+02 + 3 7.500000000000001e+02 5.452704712456864e+02 4.996070210616755e+02 -1.247915770129769e+02 + ME 2.000331222506823e+00 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -1.756428583566509e+02 5.398397609473058e+02 -4.901251052625874e+02 + 3 7.500000000000000e+02 
1.756428583566503e+02 -5.398397609473059e+02 4.901251052625868e+02 + ME 2.280054547244574e+00 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -7.343271291233364e+02 5.984001257480936e+01 1.402955463602692e+02 + 3 7.500000000000000e+02 7.343271291233360e+02 -5.984001257480933e+01 -1.402955463602694e+02 + ME 1.997894644163825e+00 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 5.956876772329629e+02 -3.688982317556030e+02 -2.675262338545276e+02 + 3 7.499999999999999e+02 -5.956876772329626e+02 3.688982317556029e+02 2.675262338545273e+02 + ME 1.983574637868442e+00 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.610896376803973e+02 -1.089359794187623e+02 5.814028710041215e+02 + 3 7.499999999999999e+02 -4.610896376803973e+02 1.089359794187621e+02 -5.814028710041215e+02 + ME 2.987844404961483e+00 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.080441809468048e+02 2.091039913840823e+02 -1.320945063756017e+02 + 3 7.500000000000002e+02 -7.080441809468050e+02 -2.091039913840823e+02 1.320945063756016e+02 + ME 1.999197513465153e+00 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000000e+02 6.069769124239758e+02 3.104285773922232e+02 3.125909885497298e+02 + 3 7.499999999999997e+02 -6.069769124239758e+02 -3.104285773922232e+02 -3.125909885497298e+02 + ME 1.989802440637427e+00 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.828417389515272e+01 3.828517565470377e+02 6.412969702909132e+02 + 3 7.500000000000001e+02 -6.828417389515273e+01 -3.828517565470376e+02 -6.412969702909131e+02 + ME 4.246334187981941e+00 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.807789436749608e+02 6.954089716815359e+02 -8.339478017354233e+00 + 3 7.499999999999999e+02 -2.807789436749607e+02 -6.954089716815359e+02 8.339478017354306e+00 + ME 2.011037025935879e+00 + +Event 164 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.142343236884305e+02 3.091601587257571e+02 -6.067757296338901e+02 + 3 7.500000000000000e+02 -3.142343236884303e+02 -3.091601587257572e+02 6.067757296338900e+02 + ME 3.388136839521116e+00 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.065756695626629e+02 4.857807362514072e+02 2.643826153403072e+02 + 3 7.499999999999993e+02 5.065756695626631e+02 -4.857807362514072e+02 -2.643826153403074e+02 + ME 1.983511995791894e+00 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.582369138095781e+02 -7.322863531814778e+02 3.489664815125224e+01 + 3 7.500000000000001e+02 -1.582369138095780e+02 7.322863531814778e+02 -3.489664815125219e+01 + ME 2.010164566736145e+00 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 4.506018848986033e+02 -1.448154652770772e+02 -5.817958596813918e+02 + 3 7.499999999999998e+02 -4.506018848986031e+02 1.448154652770771e+02 5.817958596813919e+02 + ME 2.993029659647449e+00 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.383838657580175e+02 4.266457451013773e+02 -6.011017500263644e+02 + 3 7.500000000000000e+02 -1.383838657580178e+02 -4.266457451013773e+02 6.011017500263642e+02 + ME 3.285727681812074e+00 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.787059935025895e+02 5.010202352445186e+02 -2.869133940063202e+02 + 3 7.499999999999999e+02 -4.787059935025894e+02 -5.010202352445185e+02 2.869133940063202e+02 + ME 1.984912165231123e+00 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.735853015668399e+02 3.083708463739302e+01 6.976389875699663e+02 + 3 7.499999999999999e+02 2.735853015668399e+02 -3.083708463739308e+01 -6.976389875699662e+02 + ME 
7.651473042207752e+00 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.071409269801590e+01 6.637658831421408e+02 -3.438439345464608e+02 + 3 7.499999999999998e+02 -6.071409269801586e+01 -6.637658831421406e+02 3.438439345464608e+02 + ME 2.002296135796218e+00 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.459627102375574e+02 -5.960011581968069e+01 4.987445367439287e+01 + 3 7.500000000000014e+02 7.459627102375568e+02 5.960011581968120e+01 -4.987445367439179e+01 + ME 2.009214312595575e+00 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.227626126251469e+02 -8.611217040357144e+01 5.308520932893802e+02 + 3 7.500000000000003e+02 5.227626126251467e+02 8.611217040357188e+01 -5.308520932893800e+02 + ME 2.500827870542702e+00 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 7.427899788159027e+02 8.606171690349179e+01 -5.793468955896293e+01 + 3 7.500000000000008e+02 -7.427899788159040e+02 -8.606171690349203e+01 5.793468955896337e+01 + ME 2.008572783324111e+00 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 1.880595469134288e+02 1.952277576903689e+02 6.992994562002085e+02 
+ 3 7.500000000000005e+02 -1.880595469134287e+02 -1.952277576903685e+02 -6.992994562002081e+02 + ME 7.837424879675873e+00 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.956039433245556e+02 6.710800379532824e+02 -1.573845333938319e+02 + 3 7.500000000000003e+02 2.956039433245558e+02 -6.710800379532825e+02 1.573845333938319e+02 + ME 1.995121848065215e+00 + +Event 177 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.644810965455426e+02 -6.588907599528918e+02 2.416872276699125e+02 + 3 7.500000000000003e+02 -2.644810965455426e+02 6.588907599528917e+02 -2.416872276699127e+02 + ME 1.984092858745212e+00 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.046483864607852e+02 3.100401775516634e+02 3.174514404662581e+02 + 3 7.500000000000002e+02 6.046483864607852e+02 -3.100401775516634e+02 -3.174514404662581e+02 + ME 1.991217628237894e+00 + +Event 179 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999952e+02 -7.554400969691652e+01 -5.342383817568527e+02 -5.209438108440612e+02 + 3 7.499999999999992e+02 7.554400969691555e+01 5.342383817568511e+02 5.209438108440604e+02 + ME 2.436643331539693e+00 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 7.500000000000001e+02 5.109280079817108e+02 7.886792704189078e+01 5.433529430710355e+02 + 3 7.500000000000002e+02 -5.109280079817111e+02 -7.886792704189065e+01 -5.433529430710350e+02 + ME 2.593854265630499e+00 + +Event 181 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -2.142616447442584e+02 -7.154366257261994e+02 -6.886495597177299e+01 + 3 7.500000000000006e+02 2.142616447442587e+02 7.154366257261991e+02 6.886495597177280e+01 + ME 2.007564501650115e+00 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.851592186609105e+02 1.986891791084640e+02 2.314723637697936e+02 + 3 7.500000000000001e+02 -6.851592186609104e+02 -1.986891791084641e+02 -2.314723637697936e+02 + ME 1.984847553137794e+00 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000302e+02 -4.900054502464825e+02 -4.951952606757475e+02 -2.778062499891797e+02 + 3 7.500000000000072e+02 4.900054502465236e+02 4.951952606757305e+02 2.778062499891464e+02 + ME 1.984067510574099e+00 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -4.678382693716270e+02 5.778877958512080e+02 -9.835166047197205e+01 + 3 7.500000000000005e+02 4.678382693716272e+02 -5.778877958512080e+02 9.835166047197220e+01 + ME 2.004127965245699e+00 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.061006737744816e+02 -6.192100761952033e+02 4.096602599263589e+02 + 3 7.499999999999989e+02 1.061006737744816e+02 6.192100761952032e+02 -4.096602599263590e+02 + ME 2.066323292737944e+00 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.455169979042824e+02 -6.058703167657474e+02 3.676174138996159e+02 + 3 7.500000000000002e+02 -2.455169979042824e+02 6.058703167657475e+02 -3.676174138996160e+02 + ME 2.018319718790237e+00 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 -7.262021511245958e+01 -7.131602101967057e+02 -2.205194298677789e+02 + 3 7.500000000000001e+02 7.262021511245966e+01 7.131602101967057e+02 2.205194298677793e+02 + ME 1.985921774689299e+00 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000017e+02 -7.363140077196471e+02 -6.013110168472285e+01 1.293287773313052e+02 + 3 7.499999999999994e+02 7.363140077196483e+02 6.013110168472299e+01 -1.293287773313051e+02 + ME 1.999630194484571e+00 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 4.206549846649757e+02 6.188260461572106e+02 -5.102654675699417e+01 + 3 7.499999999999998e+02 -4.206549846649756e+02 -6.188260461572107e+02 
5.102654675699394e+01 + ME 2.009128074801350e+00 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.604864850049361e+00 4.279484687014978e+01 -7.487570945048690e+02 + 3 7.500000000000005e+02 5.604864850049498e+00 -4.279484687014975e+01 7.487570945048689e+02 + ME 2.560979109157362e+01 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 3.197160362814988e+01 2.941727699376931e+02 -6.891590512999376e+02 + 3 7.500000000000002e+02 -3.197160362814988e+01 -2.941727699376931e+02 6.891590512999373e+02 + ME 6.822140982496753e+00 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.806169100853666e+02 -1.212419534166893e+02 -4.590036954694657e+02 + 3 7.499999999999997e+02 -5.806169100853666e+02 1.212419534166893e+02 4.590036954694659e+02 + ME 2.171492035497775e+00 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -6.456318764132128e+02 1.898225735585407e+02 3.310994876570574e+02 + 3 7.499999999999995e+02 6.456318764132126e+02 -1.898225735585411e+02 -3.310994876570574e+02 + ME 1.996179823211170e+00 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 6.338176939226099e+02 
-1.425870083131823e+02 -3.747586902673792e+02 + 3 7.500000000000002e+02 -6.338176939226099e+02 1.425870083131823e+02 3.747586902673791e+02 + ME 2.024506557702449e+00 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.241223574217288e+01 6.578529508306934e+02 -3.576740098786882e+02 + 3 7.500000000000006e+02 -4.241223574217268e+01 -6.578529508306935e+02 3.576740098786880e+02 + ME 2.010814173675303e+00 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.866306133186302e+02 -4.882418460209895e+01 6.913458544798877e+02 + 3 7.499999999999999e+02 -2.866306133186302e+02 4.882418460209892e+01 -6.913458544798877e+02 + ME 7.018637302241416e+00 + +Event 197 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.280026621922506e+02 -1.673514311633879e+02 -5.927117508906940e+02 + 3 7.500000000000000e+02 4.280026621922506e+02 1.673514311633879e+02 5.927117508906940e+02 + ME 3.148789429367749e+00 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.361491583779542e+02 5.918825836234865e+02 4.400436499668660e+02 + 3 7.500000000000000e+02 1.361491583779541e+02 -5.918825836234865e+02 -4.400436499668658e+02 + ME 2.123099646243896e+00 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.634973495764100e+02 2.221316746677182e+01 -3.489668211967611e+02 + 3 7.499999999999998e+02 6.634973495764100e+02 -2.221316746677164e+01 3.489668211967611e+02 + ME 2.005207966091981e+00 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -8.393568923823409e+01 7.316416839490794e+02 1.419691740493622e+02 + 3 7.500000000000000e+02 8.393568923823405e+01 -7.316416839490794e+02 -1.419691740493622e+02 + ME 1.997625733174078e+00 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.430360425977639e+02 -7.001150406129030e+02 -1.152059542992963e+02 + 3 7.499999999999997e+02 -2.430360425977634e+02 7.001150406129030e+02 1.152059542992962e+02 + ME 2.001770217517976e+00 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.592948659363004e+02 -1.261773212911235e+02 3.345288677256897e+02 + 3 7.499999999999999e+02 6.592948659363005e+02 1.261773212911236e+02 -3.345288677256896e+02 + ME 1.997675744803344e+00 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 6.830488054426544e+02 -6.443348473770897e+01 -3.029730275584196e+02 + 3 7.499999999999976e+02 -6.830488054426536e+02 6.443348473771104e+01 3.029730275584206e+02 + ME 1.987490455239470e+00 + +Event 204 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.581562204420950e+02 4.859410156234014e+02 5.489516729825758e+02 + 3 7.500000000000001e+02 -1.581562204420950e+02 -4.859410156234014e+02 -5.489516729825760e+02 + ME 2.640462426858579e+00 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 4.522293593956092e+02 5.926970499435945e+02 8.184627962711173e+01 + 3 7.499999999999994e+02 -4.522293593956091e+02 -5.926970499435945e+02 -8.184627962711099e+01 + ME 2.006172642011354e+00 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 5.347787837401266e+02 -3.496041176300519e+02 3.927958927961735e+02 + 3 7.499999999999994e+02 -5.347787837401273e+02 3.496041176300523e+02 -3.927958927961737e+02 + ME 2.043497155295106e+00 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 3.012154446331053e+02 6.385938504619652e+02 -2.529172790986054e+02 + 3 7.500000000000000e+02 -3.012154446331053e+02 -6.385938504619652e+02 2.529172790986054e+02 + ME 1.983594671806983e+00 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 7.361476225156937e+02 -7.555700020645612e+01 1.219746595990005e+02 + 3 7.499999999999993e+02 
-7.361476225156937e+02 7.555700020645673e+01 -1.219746595989994e+02 + ME 2.000760387842997e+00 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.682577368640646e+02 -6.981157757233002e+02 5.632184566245705e+01 + 3 7.500000000000002e+02 -2.682577368640645e+02 6.981157757233002e+02 -5.632184566245736e+01 + ME 2.008708228725182e+00 + +Event 210 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -9.656834699970364e+01 -3.286793908889076e+01 7.430304522277629e+02 + 3 7.500000000000000e+02 9.656834699970366e+01 3.286793908889072e+01 -7.430304522277628e+02 + ME 2.051796274876418e+01 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.319690574526541e+02 -6.682701567195196e+02 2.492094581323018e+02 + 3 7.499999999999999e+02 2.319690574526541e+02 6.682701567195196e+02 -2.492094581323018e+02 + ME 1.983716976680244e+00 + +Event 212 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.134259033134994e+02 3.540686678370770e+02 2.466658460348640e+02 + 3 7.500000000000000e+02 6.134259033134992e+02 -3.540686678370769e+02 -2.466658460348641e+02 + ME 1.983825553445415e+00 + +Event 213 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999999e+02 -5.285016171336246e+02 5.172862825223079e+02 -1.249037333364969e+02 + 3 7.499999999999999e+02 5.285016171336246e+02 -5.172862825223079e+02 1.249037333364970e+02 + ME 2.000314035056867e+00 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.200786657490620e+02 -3.467878641431868e+02 -5.829132225428646e+02 + 3 7.499999999999999e+02 -3.200786657490620e+02 3.467878641431867e+02 5.829132225428646e+02 + ME 3.007924411782479e+00 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999953e+02 -6.774094555834020e+02 -2.472477212256584e+02 2.061188827713712e+02 + 3 7.499999999999992e+02 6.774094555834032e+02 2.472477212256532e+02 -2.061188827713711e+02 + ME 1.987666589466212e+00 + +Event 216 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -2.582427834329911e+02 2.233184861234906e+02 6.677870308416391e+02 + 3 7.499999999999992e+02 2.582427834329910e+02 -2.233184861234906e+02 -6.677870308416393e+02 + ME 5.356191869677452e+00 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.526164805502724e+02 -1.109167036758217e+02 -5.876527940714437e+02 + 3 7.499999999999997e+02 -4.526164805502727e+02 1.109167036758218e+02 5.876527940714441e+02 + ME 3.073688651172201e+00 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999842e+02 -2.774671855265902e+02 -6.964182224530531e+02 2.266319463985957e+01 + 3 7.499999999999866e+02 2.774671855265876e+02 6.964182224530413e+02 -2.266319463988859e+01 + ME 2.010698134374191e+00 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -5.962071114522208e+02 4.057112512865971e+02 -2.059986913387149e+02 + 3 7.500000000000006e+02 5.962071114522209e+02 -4.057112512865967e+02 2.059986913387148e+02 + ME 1.987682472208512e+00 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.732003838284643e+02 1.672407953016318e+02 4.538412017058155e+02 + 3 7.499999999999999e+02 -5.732003838284643e+02 -1.672407953016320e+02 -4.538412017058155e+02 + ME 2.157161276750328e+00 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.373739041416514e+01 6.118806829877553e+02 -4.330572318790715e+02 + 3 7.500000000000001e+02 -2.373739041416518e+01 -6.118806829877553e+02 4.330572318790716e+02 + ME 2.107991304156184e+00 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.926156270671631e+02 -4.951619530302006e+02 4.813426379071443e+02 + 3 7.500000000000000e+02 2.926156270671630e+02 4.951619530302006e+02 -4.813426379071443e+02 + ME 
2.245144056797008e+00 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.745329870696012e+02 3.653875131398770e+01 -7.284937584337814e+02 + 3 7.499999999999995e+02 1.745329870696012e+02 -3.653875131398771e+01 7.284937584337814e+02 + ME 1.348601976294947e+01 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -5.087597823083426e+02 -4.620767648884738e+02 -3.002474766851025e+02 + 3 7.499999999999997e+02 5.087597823083424e+02 4.620767648884733e+02 3.002474766851024e+02 + ME 1.986945509771933e+00 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000013e+02 -2.331337537494541e+02 -7.096534890457165e+02 -6.738381369388374e+01 + 3 7.499999999999987e+02 2.331337537494539e+02 7.096534890457161e+02 6.738381369388425e+01 + ME 2.007710169037881e+00 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.653645699768242e+02 -5.593089569178837e+02 4.233853277009536e+02 + 3 7.499999999999998e+02 -2.653645699768244e+02 5.593089569178835e+02 -4.233853277009537e+02 + ME 2.089198208343560e+00 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.520233915006850e+02 6.293027014655148e+02 
-3.786383477209280e+02 + 3 7.499999999999999e+02 1.520233915006851e+02 -6.293027014655148e+02 3.786383477209282e+02 + ME 2.028168432094755e+00 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -3.734089345660776e+00 1.222238163786131e+02 7.399644554210732e+02 + 3 7.499999999999997e+02 3.734089345661521e+00 -1.222238163786135e+02 -7.399644554210735e+02 + ME 1.851576164018946e+01 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -6.305938355568204e+02 2.251166185777011e+02 -3.378963193020605e+02 + 3 7.500000000000007e+02 6.305938355568201e+02 -2.251166185777020e+02 3.378963193020607e+02 + ME 1.999249677967361e+00 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000066e+02 1.722967433849862e+02 6.011245480280832e+02 -4.140810427645376e+02 + 3 7.500000000000027e+02 -1.722967433849865e+02 -6.011245480280835e+02 4.140810427645375e+02 + ME 2.073237677970927e+00 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -2.986233395749247e+02 5.694313413368087e+02 -3.860984959365696e+02 + 3 7.499999999999993e+02 2.986233395749253e+02 -5.694313413368089e+02 3.860984959365697e+02 + ME 2.035845382635010e+00 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999983e+02 1.772789865710537e+02 -7.124844710497254e+02 -1.530948706957047e+02 + 3 7.500000000000007e+02 -1.772789865710537e+02 7.124844710497252e+02 1.530948706957046e+02 + ME 1.995821457072535e+00 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.448110289802655e+02 -6.646149940143566e+02 -4.350016120507395e+01 + 3 7.500000000000002e+02 -3.448110289802656e+02 6.646149940143566e+02 4.350016120507374e+01 + ME 2.009657867600829e+00 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.161943612943173e+01 -7.278723602156027e+02 1.733126107582698e+02 + 3 7.499999999999995e+02 -5.161943612943171e+01 7.278723602156031e+02 -1.733126107582698e+02 + ME 1.992541381337467e+00 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 2.147886171723103e+02 -3.086282984760169e+02 6.489332965051219e+02 + 3 7.499999999999995e+02 -2.147886171723102e+02 3.086282984760170e+02 -6.489332965051220e+02 + ME 4.512216218351590e+00 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.457991029234601e+02 -2.210062269052574e+02 6.732303087882513e+02 + 3 7.500000000000000e+02 2.457991029234601e+02 2.210062269052574e+02 -6.732303087882513e+02 + ME 5.665591081380857e+00 + +Event 237 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.076996593814686e+02 -4.026323435046959e+02 -5.977023067186025e+02 + 3 7.499999999999999e+02 -2.076996593814686e+02 4.026323435046959e+02 5.977023067186025e+02 + ME 3.228270627806512e+00 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.969032777715727e+02 -4.416010667956443e+02 -1.058063078956158e+02 + 3 7.500000000000001e+02 -5.969032777715726e+02 4.416010667956443e+02 1.058063078956159e+02 + ME 2.003115322790910e+00 + +Event 239 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.188858424502662e+02 7.168183137005395e+02 -2.757703969522984e+01 + 3 7.500000000000000e+02 -2.188858424502661e+02 -7.168183137005394e+02 2.757703969522973e+01 + ME 2.010510537061426e+00 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 5.906386180567445e+02 -1.043852672012196e+01 -4.621007033320046e+02 + 3 7.500000000000011e+02 -5.906386180567446e+02 1.043852672012131e+01 4.621007033320056e+02 + ME 2.180538801263046e+00 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 1.310164484338845e+02 -7.028250186529380e+02 -2.266532227771519e+02 + 3 7.500000000000001e+02 -1.310164484338845e+02 
7.028250186529380e+02 2.266532227771515e+02 + ME 1.985289439468672e+00 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.060596880801855e+02 6.058870732428816e+02 3.189487761529025e+02 + 3 7.499999999999997e+02 3.060596880801855e+02 -6.058870732428816e+02 -3.189487761529026e+02 + ME 1.991689118163374e+00 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.455428611464695e+02 6.177778614269365e-02 -3.818041521596316e+02 + 3 7.500000000000000e+02 -6.455428611464695e+02 -6.177778614266934e-02 3.818041521596317e+02 + ME 2.031321522676738e+00 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.600023964153236e+02 5.665197502294575e+01 -5.896510400382238e+02 + 3 7.500000000000001e+02 -4.600023964153238e+02 -5.665197502294569e+01 5.896510400382238e+02 + ME 3.102724647631286e+00 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.283205132260829e+02 3.195629486835196e+01 -5.313720264775102e+02 + 3 7.500000000000001e+02 5.283205132260829e+02 -3.195629486835195e+01 5.313720264775100e+02 + ME 2.504416662787443e+00 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 
-2.245676679674773e+02 -7.153172514000221e+02 1.976340945196909e+01 + 3 7.500000000000003e+02 2.245676679674772e+02 7.153172514000221e+02 -1.976340945196911e+01 + ME 2.010791836793424e+00 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.304482834256935e+02 1.189549860529900e+02 7.037352484843176e+02 + 3 7.500000000000000e+02 2.304482834256934e+02 -1.189549860529901e+02 -7.037352484843176e+02 + ME 8.380012727899235e+00 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 2.537801015626409e+02 -6.571572670245547e+02 -2.573713007459760e+02 + 3 7.500000000000005e+02 -2.537801015626402e+02 6.571572670245531e+02 2.573713007459746e+02 + ME 1.983507342724994e+00 + +Event 249 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.380072251709859e+02 4.007276857381548e+02 5.875967006794293e+02 + 3 7.500000000000013e+02 -2.380072251709867e+02 -4.007276857381548e+02 -5.875967006794292e+02 + ME 3.072885070943804e+00 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 6.786028214439425e-01 -3.391021082103954e+02 6.689613589030946e+02 + 3 7.500000000000001e+02 -6.786028214447883e-01 3.391021082103958e+02 -6.689613589030951e+02 + ME 5.419965726617475e+00 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 8.268187630824573e+01 3.486690416389526e+02 6.588578046382401e+02 + 3 7.499999999999982e+02 -8.268187630824475e+01 -3.486690416389528e+02 -6.588578046382391e+02 + ME 4.918030503810574e+00 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 -4.102134709467123e+02 -2.983471238084353e+02 5.524616746608582e+02 + 3 7.499999999999987e+02 4.102134709467135e+02 2.983471238084358e+02 -5.524616746608599e+02 + ME 2.671405704941876e+00 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.361744772971164e+02 -1.872893882889482e+02 -7.133576920221781e+02 + 3 7.499999999999997e+02 1.361744772971161e+02 1.872893882889483e+02 7.133576920221780e+02 + ME 9.846978238614682e+00 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 6.137214690822855e+02 -3.619701676967441e+02 2.341443061128604e+02 + 3 7.500000000000000e+02 -6.137214690822851e+02 3.619701676967441e+02 -2.341443061128601e+02 + ME 1.984625207398940e+00 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.766263610363187e+02 -4.252208793533440e+02 5.524174690773032e+02 + 3 7.500000000000000e+02 -2.766263610363188e+02 4.252208793533439e+02 -5.524174690773031e+02 + ME 2.671007381446374e+00 + +Event 0 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.886369700926295e+02 -5.166396019380891e+02 -2.383640769242444e+02 + 3 7.499999999999995e+02 -4.886369700926294e+02 5.166396019380888e+02 2.383640769242449e+02 + ME 1.984309347522146e+00 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 3.808436077386334e+02 -5.054156008216494e+02 4.025086544295820e+02 + 3 7.500000000000000e+02 -3.808436077386334e+02 5.054156008216497e+02 -4.025086544295819e+02 + ME 2.055984493577888e+00 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 5.183034673675862e+02 4.361721238657036e+02 -3.218934514357475e+02 + 3 7.499999999999985e+02 -5.183034673675862e+02 -4.361721238657033e+02 3.218934514357483e+02 + ME 1.992666939335083e+00 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.346631711305881e+02 1.079089156707178e+02 1.054783906926816e+02 + 3 7.499999999999998e+02 7.346631711305880e+02 -1.079089156707179e+02 -1.054783906926816e+02 + ME 2.003160923858462e+00 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -3.860135335017674e+02 -5.185727762898844e+02 -3.802312817805502e+02 + 3 7.499999999999999e+02 
3.860135335017676e+02 5.185727762898844e+02 3.802312817805511e+02 + ME 2.029736071461704e+00 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.412146924854130e+01 6.250106998537266e+02 4.131553586783979e+02 + 3 7.500000000000000e+02 3.412146924854130e+01 -6.250106998537266e+02 -4.131553586783979e+02 + ME 2.071755473651405e+00 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 5.103283769498017e+02 -1.882131351255163e+02 -5.163726981996678e+02 + 3 7.499999999999985e+02 -5.103283769498009e+02 1.882131351255149e+02 5.163726981996668e+02 + ME 2.409556649220553e+00 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -5.382967505952907e+02 -1.493237627632333e+02 5.004388296013620e+02 + 3 7.500000000000005e+02 5.382967505952914e+02 1.493237627632330e+02 -5.004388296013620e+02 + ME 2.326090561404597e+00 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 -4.598745101308198e+02 -5.621086252728979e+02 -1.872146584158240e+02 + 3 7.500000000000009e+02 4.598745101308191e+02 5.621086252728995e+02 1.872146584158209e+02 + ME 1.990368021634358e+00 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 
1.775786374899710e+02 -3.096805779838333e+02 6.595936378762344e+02 + 3 7.499999999999978e+02 -1.775786374899704e+02 3.096805779838338e+02 -6.595936378762349e+02 + ME 4.951273970403449e+00 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999965e+02 -5.131238447356303e+02 -5.270922542475897e+02 1.462110648204188e+02 + 3 7.499999999999962e+02 5.131238447356279e+02 5.270922542475851e+02 -1.462110648204196e+02 + ME 1.996940667767854e+00 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.355823051698720e+02 3.562860272236928e+02 -1.777509498149288e+02 + 3 7.499999999999998e+02 6.355823051698721e+02 -3.562860272236929e+02 1.777509498149290e+02 + ME 1.991835916677821e+00 + +Event 12 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.902557542118753e+02 -4.044250918271754e+02 4.966294306674758e+02 + 3 7.499999999999995e+02 3.902557542118754e+02 4.044250918271755e+02 -4.966294306674758e+02 + ME 2.308413684383986e+00 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.388642282518219e+02 -5.842429835301779e+02 4.050975396832451e+02 + 3 7.499999999999999e+02 -2.388642282518218e+02 5.842429835301775e+02 -4.050975396832447e+02 + ME 2.059610122598538e+00 + +Event 14 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 5.937999534112082e+02 4.550788359673929e+02 5.296100814192651e+01 + 3 7.499999999999998e+02 -5.937999534112082e+02 -4.550788359673929e+02 -5.296100814192653e+01 + ME 2.008979148314806e+00 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.423179755450547e+02 4.493605018245672e+02 -5.833701511362493e+02 + 3 7.500000000000000e+02 1.423179755450545e+02 -4.493605018245675e+02 5.833701511362492e+02 + ME 3.014080798350171e+00 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -2.706489889553401e+02 1.222987596040279e+02 -6.886887091979718e+02 + 3 7.500000000000003e+02 2.706489889553404e+02 -1.222987596040274e+02 6.886887091979718e+02 + ME 6.781287478663047e+00 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.991385516924362e+02 -3.979472289417108e+02 3.936987543693655e+02 + 3 7.499999999999992e+02 4.991385516924362e+02 3.979472289417105e+02 -3.936987543693655e+02 + ME 2.044586663931617e+00 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.716904304441446e+02 6.823352520667956e+02 1.519964269054312e+02 + 3 7.499999999999999e+02 -2.716904304441446e+02 -6.823352520667955e+02 -1.519964269054312e+02 + ME 1.996000455344738e+00 + +Event 19 Batch 1 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.773498418349229e+02 4.394024782830364e+02 -3.762480439535859e+02 + 3 7.499999999999997e+02 4.773498418349229e+02 -4.394024782830363e+02 3.762480439535859e+02 + ME 2.025886490708682e+00 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 2.152723736932655e+01 -4.446684930211732e+02 -6.035780888712935e+02 + 3 7.499999999999998e+02 -2.152723736932609e+01 4.446684930211729e+02 6.035780888712937e+02 + ME 3.329386212830298e+00 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.518768158716146e+02 2.713668944146131e+01 5.979723600333784e+02 + 3 7.500000000000000e+02 4.518768158716145e+02 -2.713668944146123e+01 -5.979723600333785e+02 + ME 3.232733732234588e+00 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.605342203226538e+02 -5.585006861164979e+02 4.274329229812182e+02 + 3 7.500000000000001e+02 2.605342203226538e+02 5.585006861164979e+02 -4.274329229812183e+02 + ME 2.096776535139024e+00 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.082800789219980e+02 -4.759337282317115e+02 2.786367666021681e+02 + 3 7.500000000000001e+02 
-5.082800789219980e+02 4.759337282317115e+02 -2.786367666021681e+02 + ME 1.984127871373746e+00 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.304443270717347e+02 -6.119700149046336e+02 5.206133065322422e+01 + 3 7.499999999999990e+02 4.304443270717353e+02 6.119700149046336e+02 -5.206133065322406e+01 + ME 2.009049052216004e+00 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 9.357647372272227e+01 -4.354467328122423e+02 6.034315093270851e+02 + 3 7.499999999999998e+02 -9.357647372272230e+01 4.354467328122424e+02 -6.034315093270851e+02 + ME 3.326758172083423e+00 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.406043478727808e+02 3.407510258582017e+02 -5.747846697561973e+02 + 3 7.500000000000003e+02 3.406043478727808e+02 -3.407510258582015e+02 5.747846697561973e+02 + ME 2.904517396756161e+00 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -3.284333987867702e+02 6.303438323961643e+02 -2.393703313309438e+02 + 3 7.499999999999999e+02 3.284333987867702e+02 -6.303438323961647e+02 2.393703313309439e+02 + ME 1.984240705194220e+00 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999989e+02 -1.029133508056653e+02 6.216832636308152e+02 -4.067170539174462e+02 + 3 7.499999999999976e+02 1.029133508056661e+02 -6.216832636308162e+02 4.067170539174470e+02 + ME 2.061944996606000e+00 + +Event 29 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.821390153127887e+02 5.538122935981005e+02 1.527544217783090e+02 + 3 7.499999999999990e+02 4.821390153127889e+02 -5.538122935981010e+02 -1.527544217783090e+02 + ME 1.995876946057352e+00 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 9.489768343899584e+01 -7.281060338646737e+02 1.528268076214603e+02 + 3 7.500000000000006e+02 -9.489768343899605e+01 7.281060338646736e+02 -1.528268076214603e+02 + ME 1.995865148804488e+00 + +Event 31 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 2.588924414604490e+02 -6.567632234918367e+02 -2.532523879912314e+02 + 3 7.500000000000003e+02 -2.588924414604486e+02 6.567632234918362e+02 2.532523879912308e+02 + ME 1.983585792596440e+00 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.671713815281792e+02 4.353863719603355e+02 4.879793885994619e+02 + 3 7.500000000000005e+02 3.671713815281791e+02 -4.353863719603356e+02 -4.879793885994619e+02 + ME 2.271178879084551e+00 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000023e+02 6.319452257542158e+02 1.146829159774965e+02 -3.872893755699342e+02 + 3 7.499999999999990e+02 -6.319452257542162e+02 -1.146829159774965e+02 3.872893755699342e+02 + ME 2.037151795755364e+00 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.486096093516040e+02 -5.693192358255851e+02 4.202009874536406e+02 + 3 7.500000000000000e+02 -2.486096093516041e+02 5.693192358255851e+02 -4.202009874536406e+02 + ME 2.083512716248840e+00 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.016777587248104e+02 2.605400911432719e+02 -4.761495372235753e+01 + 3 7.500000000000000e+02 7.016777587248104e+02 -2.605400911432719e+02 4.761495372235753e+01 + ME 2.009378076767459e+00 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.992190538187328e+02 -1.415759996521825e+02 -4.282438112373230e+02 + 3 7.500000000000001e+02 5.992190538187328e+02 1.415759996521826e+02 4.282438112373230e+02 + ME 2.098343464958287e+00 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.677951851527304e+02 1.710101924456866e+02 2.954743724361214e+02 + 3 7.499999999999999e+02 -6.677951851527304e+02 -1.710101924456866e+02 -2.954743724361211e+02 + ME 1.986100619014213e+00 + +Event 38 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.262233168639915e+01 -1.242788051966740e+02 7.369757269768426e+02 + 3 7.500000000000003e+02 6.262233168639881e+01 1.242788051966741e+02 -7.369757269768426e+02 + ME 1.689340277486609e+01 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.702492999956165e+02 -1.498752499034606e+01 3.362160762813851e+02 + 3 7.500000000000003e+02 -6.702492999956166e+02 1.498752499034595e+01 -3.362160762813852e+02 + ME 1.998451081732437e+00 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.153528524047671e+02 -7.173641694168138e+02 -3.888183892327462e+01 + 3 7.499999999999998e+02 2.153528524047672e+02 7.173641694168139e+02 3.888183892327462e+01 + ME 2.009943158764145e+00 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000041e+02 -1.577959591347038e+02 1.068869602279858e+02 -7.253796337187771e+02 + 3 7.500000000000107e+02 1.577959591347143e+02 -1.068869602279727e+02 7.253796337187816e+02 + ME 1.254378447326961e+01 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 2.605757710166904e+02 -3.124498327841131e+02 6.300598142654687e+02 + 3 7.500000000000003e+02 
-2.605757710166903e+02 3.124498327841133e+02 -6.300598142654685e+02 + ME 3.913466437570163e+00 + +Event 43 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 7.087179265265712e+02 -2.450238073334031e+02 1.349942442185459e+01 + 3 7.500000000000000e+02 -7.087179265265711e+02 2.450238073334030e+02 -1.349942442185452e+01 + ME 2.010950853835399e+00 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 -1.936709096449666e+01 -3.211501049305160e+02 6.774861813282196e+02 + 3 7.499999999999973e+02 1.936709096449596e+01 3.211501049305142e+02 -6.774861813282163e+02 + ME 5.934203466747352e+00 + +Event 45 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.123591650584196e+02 -4.263717314995432e+02 -4.589848206317993e+02 + 3 7.500000000000002e+02 4.123591650584197e+02 4.263717314995431e+02 4.589848206317993e+02 + ME 2.171437953450995e+00 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.417510221679373e+02 1.507214984231037e+02 -5.870596769721773e+02 + 3 7.500000000000000e+02 -4.417510221679373e+02 -1.507214984231037e+02 5.870596769721773e+02 + ME 3.065223124632955e+00 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000018e+02 3.841729019576124e+02 -2.923622481869862e+02 5.739647177627895e+02 + 3 7.500000000000020e+02 -3.841729019576121e+02 2.923622481869870e+02 -5.739647177627897e+02 + ME 2.894697499858977e+00 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.683917324787872e+02 -4.888657012953391e+02 2.100391741506460e+01 + 3 7.499999999999997e+02 -5.683917324787874e+02 4.888657012953389e+02 -2.100391741506465e+01 + ME 2.010753302031892e+00 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.514168436731682e+01 -6.826028811438466e+00 7.479390349325276e+02 + 3 7.500000000000003e+02 5.514168436731686e+01 6.826028811438419e+00 -7.479390349325279e+02 + ME 2.474120018640866e+01 + +Event 50 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.984159499903414e+02 -4.045210751108285e+02 -5.566063425640616e+02 + 3 7.500000000000003e+02 2.984159499903414e+02 4.045210751108285e+02 5.566063425640616e+02 + ME 2.709765317407777e+00 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.047061352594473e+02 -2.553121422897803e+02 -2.655132627753894e+01 + 3 7.499999999999998e+02 7.047061352594473e+02 2.553121422897803e+02 2.655132627753903e+01 + ME 2.010552676572688e+00 + +Event 52 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.049072352300011e+02 -2.374327835145617e+02 -6.812772550606136e+02 + 3 7.499999999999998e+02 -2.049072352300011e+02 2.374327835145616e+02 6.812772550606135e+02 + ME 6.196141531373791e+00 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.762846372615920e+02 -3.488577990178416e+01 6.963833581971029e+02 + 3 7.499999999999997e+02 -2.762846372615919e+02 3.488577990178417e+01 -6.963833581971028e+02 + ME 7.516466508613185e+00 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.965754309649121e+02 2.298482753022433e+02 3.921320256107800e+02 + 3 7.499999999999998e+02 5.965754309649124e+02 -2.298482753022436e+02 -3.921320256107794e+02 + ME 2.042705026930181e+00 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 5.249326727483450e+02 -5.207531728510860e+02 1.255460953068927e+02 + 3 7.499999999999985e+02 -5.249326727483449e+02 5.207531728510860e+02 -1.255460953068926e+02 + ME 2.000215452878511e+00 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -7.488450735514020e+02 -4.113909777397586e+01 6.215340066190983e+00 + 3 7.500000000000000e+02 7.488450735514019e+02 4.113909777397589e+01 -6.215340066191020e+00 + ME 2.011060688550504e+00 + +Event 57 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.609915062325111e+02 7.065463493516471e+02 1.933235400535789e+02 + 3 7.499999999999999e+02 1.609915062325111e+02 -7.065463493516471e+02 -1.933235400535789e+02 + ME 1.989455455890459e+00 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.364577375600944e+02 -7.175918329662071e+02 1.701212718039087e+02 + 3 7.499999999999997e+02 -1.364577375600944e+02 7.175918329662074e+02 -1.701212718039087e+02 + ME 1.993053556927237e+00 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.772109336552556e+02 -3.832540253228650e+02 2.871304409891935e+02 + 3 7.500000000000000e+02 5.772109336552556e+02 3.832540253228651e+02 -2.871304409891938e+02 + ME 1.984937386100476e+00 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.416667813988462e+02 -5.270418494550995e+02 4.098666849251105e+02 + 3 7.499999999999986e+02 3.416667813988453e+02 5.270418494550997e+02 -4.098666849251106e+02 + ME 2.066637019285051e+00 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -4.259888540845536e+02 -6.048881415507168e+02 -1.230602795667552e+02 + 3 7.500000000000003e+02 
4.259888540845541e+02 6.048881415507168e+02 1.230602795667548e+02 + ME 2.000595572517847e+00 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 4.607995913538437e+02 -5.197207754200592e+02 -2.829382480416942e+02 + 3 7.500000000000006e+02 -4.607995913538437e+02 5.197207754200587e+02 2.829382480416937e+02 + ME 1.984493092557474e+00 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.281800042196733e+02 -1.771681287919233e+02 -7.174129498821534e+02 + 3 7.500000000000002e+02 1.281800042196737e+02 1.771681287919234e+02 7.174129498821534e+02 + ME 1.062295525526163e+01 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 6.405153128760093e+02 -3.399997059964824e+02 -1.914166499906788e+02 + 3 7.500000000000001e+02 -6.405153128760090e+02 3.399997059964824e+02 1.914166499906790e+02 + ME 1.989736848495627e+00 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.838295106811601e+02 2.142358614545267e+02 -6.948450954490232e+02 + 3 7.499999999999998e+02 -1.838295106811600e+02 -2.142358614545267e+02 6.948450954490232e+02 + ME 7.357277043327562e+00 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
5.913261705169194e+02 -1.414127723755366e+02 4.391307184322907e+02 + 3 7.500000000000002e+02 -5.913261705169197e+02 1.414127723755367e+02 -4.391307184322907e+02 + ME 2.121048094879748e+00 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.975437950835899e+02 -2.686137949018978e+01 -4.524598095400129e+02 + 3 7.500000000000002e+02 5.975437950835899e+02 2.686137949018994e+01 4.524598095400129e+02 + ME 2.153479890398904e+00 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.181330717451524e+02 -5.368392105903471e+02 7.643155066244263e+01 + 3 7.499999999999997e+02 -5.181330717451524e+02 5.368392105903467e+02 -7.643155066244277e+01 + ME 2.006777808229542e+00 + +Event 69 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.851952108095214e+02 3.816438081633558e+02 -6.185068613878011e+02 + 3 7.500000000000001e+02 -1.851952108095216e+02 -3.816438081633559e+02 6.185068613878012e+02 + ME 3.629256087665883e+00 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.420936051939211e+02 -6.644346036493946e+02 6.323466421748260e+01 + 3 7.500000000000002e+02 -3.420936051939210e+02 6.644346036493946e+02 -6.323466421748285e+01 + ME 2.008103300392967e+00 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.014973205429060e+02 -5.862298917340072e+02 4.221769106092424e+02 + 3 7.499999999999995e+02 -2.014973205429060e+02 5.862298917340071e+02 -4.221769106092424e+02 + ME 2.087012501014583e+00 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999983e+02 -5.128995270704372e+02 -5.335011449610839e+02 -1.216988227393155e+02 + 3 7.500000000000007e+02 5.128995270704373e+02 5.335011449610839e+02 1.216988227393155e+02 + ME 2.000802145188290e+00 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 6.269260124380270e+02 7.328600119139391e+01 -4.050838641046208e+02 + 3 7.499999999999995e+02 -6.269260124380268e+02 -7.328600119139392e+01 4.050838641046204e+02 + ME 2.059590627293980e+00 + +Event 74 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.868628802298931e+02 4.654417523012835e+02 3.815671661692834e+01 + 3 7.500000000000000e+02 -5.868628802298932e+02 -4.654417523012835e+02 -3.815671661692834e+01 + ME 2.009985162172173e+00 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.396554044798227e+02 3.890541063878783e+02 -4.447320342052838e+01 + 3 7.499999999999999e+02 -6.396554044798226e+02 -3.890541063878784e+02 4.447320342052834e+01 + ME 2.009593868310018e+00 + +Event 76 Batch 1 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 2.541578029035597e+02 6.696747494064764e+02 2.223500421196449e+02 + 3 7.499999999999998e+02 -2.541578029035597e+02 -6.696747494064765e+02 -2.223500421196449e+02 + ME 1.985725329848053e+00 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.794085909550948e+02 -4.035830095427122e+02 -6.061627767357163e+02 + 3 7.500000000000000e+02 1.794085909550948e+02 4.035830095427124e+02 6.061627767357163e+02 + ME 3.376661105012696e+00 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.251897931439634e+01 5.985924556239656e+02 4.513091707917184e+02 + 3 7.499999999999999e+02 2.251897931439627e+01 -5.985924556239656e+02 -4.513091707917183e+02 + ME 2.150461455825715e+00 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -8.069660169556643e+01 4.506184873965207e+02 -5.940799923337418e+02 + 3 7.499999999999999e+02 8.069660169556641e+01 -4.506184873965206e+02 5.940799923337419e+02 + ME 3.170027479232333e+00 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.302693508383343e+02 -1.702453183034974e+02 1.494010782754770e+01 + 3 7.500000000000000e+02 
-7.302693508383343e+02 1.702453183034974e+02 -1.494010782754776e+01 + ME 2.010919551852894e+00 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -3.571124181827449e+02 -1.296917603996396e+02 -6.466457825302152e+02 + 3 7.500000000000003e+02 3.571124181827448e+02 1.296917603996393e+02 6.466457825302152e+02 + ME 4.428752434825682e+00 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -2.079791544122488e+02 6.603817609425097e+02 2.883411194131628e+02 + 3 7.499999999999999e+02 2.079791544122487e+02 -6.603817609425097e+02 -2.883411194131628e+02 + ME 1.985082613506332e+00 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 2.106645025753154e+02 2.591228168950026e+02 6.715473424257536e+02 + 3 7.500000000000000e+02 -2.106645025753154e+02 -2.591228168950025e+02 -6.715473424257551e+02 + ME 5.566077345203682e+00 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.829033392465455e+02 5.412230867109214e+02 4.353656750988264e+02 + 3 7.499999999999999e+02 -2.829033392465454e+02 -5.412230867109214e+02 -4.353656750988264e+02 + ME 2.112834869994339e+00 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000001e+02 -3.452959441952361e+02 3.191655713577259e+02 -5.842979111568145e+02 + 3 7.500000000000001e+02 3.452959441952361e+02 -3.191655713577259e+02 5.842979111568145e+02 + ME 3.026699300876776e+00 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.315896816222929e+02 -5.262181855033000e+02 3.151583220585842e+02 + 3 7.500000000000000e+02 4.315896816222929e+02 5.262181855033001e+02 -3.151583220585842e+02 + ME 1.990528310218343e+00 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.811116980879163e+02 -3.427907013503557e+02 -5.474928389013653e+02 + 3 7.500000000000002e+02 -3.811116980879164e+02 3.427907013503558e+02 5.474928389013651e+02 + ME 2.628000046826205e+00 + +Event 88 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.146396045022380e+01 -6.744579332872387e+02 -3.222245766413236e+02 + 3 7.499999999999998e+02 6.146396045022396e+01 6.744579332872382e+02 3.222245766413236e+02 + ME 1.992781166334666e+00 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -9.261861454486514e+01 7.306822737190313e+02 -1.415104487752590e+02 + 3 7.499999999999997e+02 9.261861454486517e+01 -7.306822737190316e+02 1.415104487752590e+02 + ME 1.997699525087340e+00 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -4.139376319099176e+02 3.915039386019012e+02 4.877297437598081e+02 + 3 7.499999999999999e+02 4.139376319099175e+02 -3.915039386019013e+02 -4.877297437598081e+02 + ME 2.270161148116216e+00 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 7.126395371709493e+00 -7.420663234346179e+02 -1.085669660312691e+02 + 3 7.499999999999997e+02 -7.126395371709608e+00 7.420663234346180e+02 1.085669660312693e+02 + ME 2.002727750198704e+00 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -1.726503622084788e+02 -6.521623274545965e+02 -3.276830039508977e+02 + 3 7.499999999999992e+02 1.726503622084796e+02 6.521623274545966e+02 3.276830039508984e+02 + ME 1.994792831040187e+00 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.209821045541699e+02 -6.195820136229019e+02 -3.731214333339776e+01 + 3 7.500000000000000e+02 -4.209821045541700e+02 6.195820136229019e+02 3.731214333339766e+01 + ME 2.010033124088934e+00 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 6.269909815974254e+02 3.639809310785994e+02 1.920942237722172e+02 + 3 7.499999999999984e+02 -6.269909815974265e+02 -3.639809310786015e+02 -1.920942237722174e+02 + ME 1.989636481366654e+00 + +Event 95 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.553133048499316e+02 5.741086360057838e+02 -1.599658353843466e+02 + 3 7.500000000000000e+02 -4.553133048499316e+02 -5.741086360057838e+02 1.599658353843466e+02 + ME 1.994700919179108e+00 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.565619044698291e+02 6.422192246258138e+02 -2.902248381786659e+02 + 3 7.499999999999999e+02 -2.565619044698291e+02 -6.422192246258138e+02 2.902248381786659e+02 + ME 1.985324124083277e+00 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.039638951579397e+02 -2.118244738005755e+02 -5.953516303059346e+02 + 3 7.500000000000000e+02 -4.039638951579398e+02 2.118244738005755e+02 5.953516303059346e+02 + ME 3.190135488781065e+00 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -2.854056619662841e+02 -6.558450095995934e+02 2.256345086658882e+02 + 3 7.500000000000000e+02 2.854056619662841e+02 6.558450095995934e+02 -2.256345086658882e+02 + ME 1.985389237734055e+00 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 6.845741170030386e+02 -2.951022215571765e+02 8.229797787022088e+01 + 3 7.499999999999983e+02 
-6.845741170030385e+02 2.951022215571757e+02 -8.229797787021991e+01 + ME 2.006120622294754e+00 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.131199432162236e+02 6.658078482508749e+02 3.261959344539691e+02 + 3 7.499999999999998e+02 -1.131199432162237e+02 -6.658078482508747e+02 -3.261959344539691e+02 + ME 1.994220336919275e+00 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999974e+02 6.281139753924398e+02 1.799537449267306e+01 -4.094496311068950e+02 + 3 7.499999999999968e+02 -6.281139753924423e+02 -1.799537449266958e+01 4.094496311068941e+02 + ME 2.066004084437084e+00 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.331822864934145e+02 3.520635566235372e+02 -5.009035423770954e+02 + 3 7.499999999999998e+02 -4.331822864934146e+02 -3.520635566235372e+02 5.009035423770955e+02 + ME 2.328303353515298e+00 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.092502234787573e+02 -1.620958235289011e+02 -4.062500574591608e+02 + 3 7.499999999999999e+02 -6.092502234787573e+02 1.620958235289010e+02 4.062500574591608e+02 + ME 2.061266375378142e+00 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000003e+02 -7.390318022514667e+02 7.866020384803387e+01 -1.007202441991433e+02 + 3 7.500000000000001e+02 7.390318022514666e+02 -7.866020384803385e+01 1.007202441991434e+02 + ME 2.003811783293278e+00 + +Event 105 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.116661021739875e+02 -1.805958333876086e+02 -7.193162215523179e+02 + 3 7.500000000000001e+02 1.116661021739881e+02 1.805958333876084e+02 7.193162215523182e+02 + ME 1.102877972363151e+01 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.896019143353706e+02 -3.499565434426603e+01 -2.927874722764089e+02 + 3 7.500000000000000e+02 6.896019143353705e+02 3.499565434426599e+01 2.927874722764089e+02 + ME 1.985683803751423e+00 + +Event 107 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.876020488404372e+02 4.472601133788148e+02 1.310809794702898e+02 + 3 7.500000000000000e+02 5.876020488404371e+02 -4.472601133788147e+02 -1.310809794702898e+02 + ME 1.999356513234769e+00 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.474590340134895e+02 7.351525828033791e+02 1.750752082190591e+01 + 3 7.499999999999999e+02 1.474590340134896e+02 -7.351525828033792e+02 -1.750752082190613e+01 + ME 2.010855956167804e+00 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.957445751534648e+02 5.717428497601883e+02 -3.848964276469646e+02 + 3 7.499999999999997e+02 -2.957445751534648e+02 -5.717428497601884e+02 3.848964276469649e+02 + ME 2.034549794585122e+00 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.063365970415883e+00 -3.265074122789853e+02 6.751711970456008e+02 + 3 7.500000000000002e+02 -6.063365970415944e+00 3.265074122789853e+02 -6.751711970456010e+02 + ME 5.784963616984554e+00 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.490455319369717e+02 -3.406507833202949e+01 -1.644267385470533e+01 + 3 7.500000000000010e+02 7.490455319369723e+02 3.406507833202959e+01 1.644267385470358e+01 + ME 2.010883546212678e+00 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.374207045694743e+02 5.497834357859954e+01 7.093016218641318e+02 + 3 7.500000000000000e+02 -2.374207045694743e+02 -5.497834357859954e+01 -7.093016218641318e+02 + ME 9.172605741240773e+00 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.836102630475973e+01 7.112469905859641e+02 2.306983072999552e+02 + 3 7.500000000000000e+02 -5.836102630475965e+01 -7.112469905859641e+02 -2.306983072999552e+02 + ME 
1.984915059360817e+00 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.832697985127738e+02 -6.792924150459122e+02 1.442914970177566e+02 + 3 7.500000000000001e+02 2.832697985127741e+02 6.792924150459122e+02 -1.442914970177563e+02 + ME 1.997251242389987e+00 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.877068793675994e+02 -5.687932010880680e+02 2.977543787873256e+02 + 3 7.499999999999999e+02 3.877068793675994e+02 5.687932010880680e+02 -2.977543787873257e+02 + ME 1.986487190210642e+00 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.888638462220085e+02 -2.429234055910374e+02 -6.481094794568894e+02 + 3 7.499999999999998e+02 2.888638462220085e+02 2.429234055910374e+02 6.481094794568892e+02 + ME 4.481760920260451e+00 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.057325810417608e+02 -2.863650344598338e+02 -6.850661926741573e+02 + 3 7.500000000000002e+02 1.057325810417607e+02 2.863650344598338e+02 6.850661926741573e+02 + ME 6.482158221852050e+00 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.434912620455068e+02 4.133496415655201e+02 
-5.231212351989174e+02 + 3 7.500000000000001e+02 3.434912620455068e+02 -4.133496415655201e+02 5.231212351989174e+02 + ME 2.450087544872921e+00 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 5.337372537382147e+02 -1.304062014214320e+00 5.268992725315253e+02 + 3 7.499999999999994e+02 -5.337372537382158e+02 1.304062014214380e+00 -5.268992725315243e+02 + ME 2.474282470920531e+00 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 2.526044577837270e+02 -5.022225052464477e+02 4.964509473571084e+02 + 3 7.499999999999982e+02 -2.526044577837286e+02 5.022225052464470e+02 -4.964509473571077e+02 + ME 2.307605300736761e+00 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -9.983643901956184e+01 7.430481637446572e+02 2.030053692852349e+01 + 3 7.500000000000001e+02 9.983643901956184e+01 -7.430481637446572e+02 -2.030053692852350e+01 + ME 2.010775436568609e+00 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.972556630245654e+02 1.216831948271524e+02 -2.480478592466205e+02 + 3 7.499999999999999e+02 -6.972556630245654e+02 -1.216831948271524e+02 2.480478592466204e+02 + ME 1.983764132347880e+00 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 -6.140782040297679e+02 4.305900507141508e+02 4.093428375274711e-01 + 3 7.499999999999983e+02 6.140782040297669e+02 -4.305900507141531e+02 -4.093428375271901e-01 + ME 2.011090139533466e+00 + +Event 124 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999911e+02 3.588901340872197e+01 -1.799600883903310e+02 -7.272044728293959e+02 + 3 7.499999999999977e+02 -3.588901340872244e+01 1.799600883903320e+02 7.272044728294055e+02 + ME 1.308008567657206e+01 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 7.215024155075264e+02 1.793419884871221e+02 9.884693006001493e+01 + 3 7.499999999999999e+02 -7.215024155075264e+02 -1.793419884871222e+02 -9.884693006001490e+01 + ME 2.004062293875886e+00 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.281551412895951e+02 -2.336695998906990e+02 -7.010526213116709e+02 + 3 7.500000000000001e+02 -1.281551412895947e+02 2.336695998906987e+02 7.010526213116710e+02 + ME 8.043521587411112e+00 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.957501429351264e+02 4.658358804290751e+02 -5.542371460356527e+02 + 3 7.500000000000011e+02 1.957501429351260e+02 -4.658358804290745e+02 5.542371460356528e+02 + ME 2.687590346933034e+00 + +Event 128 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.318760792775902e+01 -6.964451407927102e+02 2.780112459296673e+02 + 3 7.500000000000003e+02 -1.318760792775899e+01 6.964451407927103e+02 -2.780112459296673e+02 + ME 1.984082110439539e+00 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.550350801573031e+02 -6.509004311564038e+02 3.388107918013039e+02 + 3 7.500000000000002e+02 -1.550350801573031e+02 6.509004311564037e+02 -3.388107918013039e+02 + ME 1.999695588062834e+00 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.645749092425722e+02 -7.293083567438612e+02 5.936682601728498e+01 + 3 7.499999999999986e+02 1.645749092425725e+02 7.293083567438607e+02 -5.936682601728561e+01 + ME 2.008449590658919e+00 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.289732569218596e+02 -7.242036888606353e+02 -1.462700107322627e+02 + 3 7.500000000000002e+02 -1.289732569218595e+02 7.242036888606352e+02 1.462700107322627e+02 + ME 1.996931117530840e+00 + +Event 132 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 6.604855820021804e+01 3.651487935846834e+02 -6.517698570073040e+02 + 3 7.499999999999978e+02 -6.604855820021821e+01 
-3.651487935846866e+02 6.517698570073051e+02 + ME 4.620643930585398e+00 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.812417328859055e+02 -6.435356503900175e+02 5.492365413234045e+01 + 3 7.500000000000000e+02 3.812417328859055e+02 6.435356503900175e+02 -5.492365413234045e+01 + ME 2.008822805330303e+00 + +Event 134 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.567102166802744e+02 -3.252202169638837e+02 1.595728729631448e+02 + 3 7.499999999999997e+02 6.567102166802742e+02 3.252202169638837e+02 -1.595728729631451e+02 + ME 1.994764975698830e+00 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.837316099570886e+02 -6.854901002516958e+02 1.099986180280411e+02 + 3 7.500000000000001e+02 2.837316099570886e+02 6.854901002516958e+02 -1.099986180280411e+02 + ME 2.002524237018472e+00 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.445964163493952e+02 -3.365935275091168e+02 6.544437895021098e+02 + 3 7.499999999999998e+02 1.445964163493952e+02 3.365935275091168e+02 -6.544437895021098e+02 + ME 4.728186824157167e+00 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 
2.788569431822099e+02 -2.310081397848981e+02 6.567907159759458e+02 + 3 7.499999999999999e+02 -2.788569431822102e+02 2.310081397848975e+02 -6.567907159759453e+02 + ME 4.827133279595881e+00 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.519171223259784e+02 5.310903622506274e+02 -3.957238508585245e+02 + 3 7.500000000000000e+02 -3.519171223259784e+02 -5.310903622506274e+02 3.957238508585245e+02 + ME 2.047082251362772e+00 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.739397549265399e+02 -2.618238911224230e+02 -5.950775661399049e+02 + 3 7.500000000000000e+02 -3.739397549265399e+02 2.618238911224231e+02 5.950775661399049e+02 + ME 3.185771296827542e+00 + +Event 140 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.980454399514575e+02 1.394595291726036e+02 -2.361855276809563e+02 + 3 7.500000000000002e+02 -6.980454399514578e+02 -1.394595291726037e+02 2.361855276809564e+02 + ME 1.984466872167745e+00 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.653840213895683e+02 -2.880922628106119e+01 5.874416916736162e+02 + 3 7.500000000000005e+02 -4.653840213895683e+02 2.880922628106116e+01 -5.874416916736163e+02 + ME 3.070667674723962e+00 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.270562981552866e+02 -5.222032422100018e+02 4.881077865527037e+02 + 3 7.500000000000000e+02 -2.270562981552866e+02 5.222032422100018e+02 -4.881077865527037e+02 + ME 2.271703521969705e+00 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 4.278386633790234e+02 5.011307304374201e+02 3.582206989124277e+02 + 3 7.500000000000007e+02 -4.278386633790235e+02 -5.011307304374207e+02 -3.582206989124284e+02 + ME 2.011195243007343e+00 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.485631413852261e+02 -2.868202125643110e+02 -2.441311477486854e+02 + 3 7.499999999999998e+02 6.485631413852262e+02 2.868202125643110e+02 2.441311477486855e+02 + ME 1.983952824756691e+00 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.489272678263819e+02 -2.043717898946646e-02 5.110566056896098e+02 + 3 7.500000000000000e+02 -5.489272678263819e+02 2.043717898953842e-02 -5.110566056896097e+02 + ME 2.379895727846479e+00 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.879378412153704e+02 -6.925213285531936e+02 2.451345579835954e+00 + 3 7.500000000000000e+02 2.879378412153704e+02 6.925213285531936e+02 -2.451345579835899e+00 + ME 2.011085665785286e+00 + +Event 147 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.952915705099506e+02 1.689784617132080e+02 -2.247574502766133e+02 + 3 7.500000000000000e+02 -6.952915705099505e+02 -1.689784617132080e+02 2.247574502766133e+02 + ME 1.985476870550384e+00 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.957082656749750e+02 1.510461070560446e+02 2.359556793440931e+02 + 3 7.499999999999999e+02 6.957082656749749e+02 -1.510461070560447e+02 -2.359556793440931e+02 + ME 1.984484189288116e+00 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.906011724554260e+02 -4.023428219257597e+02 4.980471641534193e+02 + 3 7.500000000000000e+02 3.906011724554261e+02 4.023428219257598e+02 -4.980471641534193e+02 + ME 2.314897321112815e+00 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000017e+02 3.825548542963734e+02 3.607661939729296e+02 -5.347892451616488e+02 + 3 7.500000000000000e+02 -3.825548542963752e+02 -3.607661939729286e+02 5.347892451616495e+02 + ME 2.528585842387213e+00 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -8.096198305186688e+01 -6.966591351540006e+02 -2.657276927736156e+02 + 3 7.500000000000000e+02 
8.096198305186688e+01 6.966591351540006e+02 2.657276927736156e+02 + ME 1.983534026450009e+00 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -3.053425729950391e+02 6.849509469742742e+02 1.039775724928324e+01 + 3 7.499999999999995e+02 3.053425729950393e+02 -6.849509469742737e+02 -1.039775724928318e+01 + ME 2.011007520774094e+00 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -9.852328869759022e+01 -1.550106591050277e+02 7.271621945261154e+02 + 3 7.500000000000001e+02 9.852328869759016e+01 1.550106591050275e+02 -7.271621945261154e+02 + ME 1.306716793396641e+01 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.148528978665797e+01 -7.103339962655652e+02 2.386090741703716e+02 + 3 7.499999999999999e+02 3.148528978665796e+01 7.103339962655652e+02 -2.386090741703715e+02 + ME 1.984292391971781e+00 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.874782580333551e+02 2.540696668148104e+02 -5.102093220918027e+02 + 3 7.500000000000001e+02 4.874782580333551e+02 -2.540696668148104e+02 5.102093220918026e+02 + ME 2.375343013454757e+00 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999997e+02 5.351140291898049e+02 3.025756173962970e+02 4.296521517710606e+02 + 3 7.500000000000002e+02 -5.351140291898049e+02 -3.025756173962971e+02 -4.296521517710606e+02 + ME 2.101104406431960e+00 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.236396872289433e+01 2.448362281552735e+02 -7.088034661738875e+02 + 3 7.499999999999993e+02 -1.236396872289407e+01 -2.448362281552726e+02 7.088034661738876e+02 + ME 9.095855285444577e+00 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 4.792285853193343e+02 5.225230422173582e+02 2.445600812985376e+02 + 3 7.500000000000003e+02 -4.792285853193339e+02 -5.225230422173579e+02 -2.445600812985381e+02 + ME 1.983929981579759e+00 + +Event 159 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.994239737611111e+02 4.487043427103748e+02 4.307333877573547e+01 + 3 7.499999999999999e+02 5.994239737611113e+02 -4.487043427103749e+02 -4.307333877573552e+01 + ME 2.009685515458492e+00 + +Event 160 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.056348972134737e+00 -5.530529151025127e+02 -5.065879560584343e+02 + 3 7.500000000000001e+02 1.056348972134632e+00 5.530529151025127e+02 5.065879560584343e+02 + ME 2.356403770495644e+00 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.175060175378059e+02 -6.201939914735475e+02 5.956625101373001e+01 + 3 7.499999999999997e+02 4.175060175378057e+02 6.201939914735476e+02 -5.956625101373005e+01 + ME 2.008432219179860e+00 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000131e+02 -3.532767836309999e+02 -6.048160389091212e+02 2.681288369899751e+02 + 3 7.499999999999952e+02 3.532767836310093e+02 6.048160389091256e+02 -2.681288369899763e+02 + ME 1.983591149479119e+00 + +Event 163 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.455155148577347e+01 -5.419588320322101e+02 5.172879417425158e+02 + 3 7.499999999999998e+02 -3.455155148577347e+01 5.419588320322102e+02 -5.172879417425160e+02 + ME 2.414859767910284e+00 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 1.203138720765576e+02 6.593334144494559e+02 3.366066261623008e+02 + 3 7.500000000000002e+02 -1.203138720765575e+02 -6.593334144494565e+02 -3.366066261623006e+02 + ME 1.998634330981031e+00 + +Event 165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.816823713406797e+02 -1.602945128408126e+02 7.097867180337724e+02 + 3 7.500000000000001e+02 1.816823713406797e+02 1.602945128408125e+02 -7.097867180337724e+02 + ME 
9.248548838925846e+00 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.857732699129311e+01 -2.617520387305624e+02 7.011605476085883e+02 + 3 7.500000000000001e+02 4.857732699129315e+01 2.617520387305623e+02 -7.011605476085884e+02 + ME 8.056552904397961e+00 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.900092704286321e+01 -6.320471173340015e+02 4.007684501288983e+02 + 3 7.500000000000000e+02 4.900092704286319e+01 6.320471173340014e+02 -4.007684501288983e+02 + ME 2.053619707066993e+00 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.264562441053386e+02 -1.340492719285199e+01 -5.340076132130089e+02 + 3 7.500000000000000e+02 -5.264562441053386e+02 1.340492719285199e+01 5.340076132130089e+02 + ME 2.522967143225648e+00 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.804167788417345e+02 6.461491047275698e+02 1.656529008013751e+01 + 3 7.500000000000001e+02 3.804167788417345e+02 -6.461491047275698e+02 -1.656529008013748e+01 + ME 2.010880456803565e+00 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -8.097738208309649e+01 -3.146836244882357e+02 
6.759562708266826e+02 + 3 7.500000000000000e+02 8.097738208309650e+01 3.146836244882357e+02 -6.759562708266826e+02 + ME 5.834715064701833e+00 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -7.540116185339261e+01 -3.719303863809235e+02 -6.469021969955712e+02 + 3 7.499999999999987e+02 7.540116185339261e+01 3.719303863809236e+02 6.469021969955708e+02 + ME 4.437938361437913e+00 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 -4.828016172825367e+02 -5.140078459318597e+02 -2.553400334257376e+02 + 3 7.499999999999998e+02 4.828016172825360e+02 5.140078459318594e+02 2.553400334257384e+02 + ME 1.983538851321157e+00 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.987927902937172e+01 -2.766666825658829e+01 7.462247930951604e+02 + 3 7.499999999999999e+02 -6.987927902937163e+01 2.766666825658830e+01 -7.462247930951604e+02 + ME 2.309142129384764e+01 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.529896621351459e+02 -2.479499892141398e+02 5.438944464208055e+02 + 3 7.500000000000001e+02 -4.529896621351459e+02 2.479499892141399e+02 -5.438944464208055e+02 + ME 2.598220410305965e+00 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 1.641260263514688e+02 6.805045353716746e+02 -2.692140873220751e+02 + 3 7.499999999999992e+02 -1.641260263514683e+02 -6.805045353716744e+02 2.692140873220759e+02 + ME 1.983624622951714e+00 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -4.449083452207172e+02 3.512022859276666e+02 4.911349292324311e+02 + 3 7.500000000000003e+02 4.449083452207170e+02 -3.512022859276664e+02 -4.911349292324313e+02 + ME 2.284312202099852e+00 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.756184157382538e+02 2.756014812918595e+02 1.734461871159609e+02 + 3 7.499999999999999e+02 -6.756184157382538e+02 -2.756014812918595e+02 -1.734461871159609e+02 + ME 1.992520025439582e+00 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.899028545581659e+01 6.651246644987089e+02 3.374458704430804e+02 + 3 7.499999999999997e+02 7.899028545581672e+01 -6.651246644987090e+02 -3.374458704430803e+02 + ME 1.999032967119208e+00 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000032e+02 -5.649043925133968e+02 -4.752925559656212e+02 1.322120023399303e+02 + 3 7.500000000000028e+02 5.649043925133992e+02 4.752925559656279e+02 -1.322120023399242e+02 + ME 1.999179049147394e+00 + +Event 180 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.443749249963931e+02 -3.789807589401902e+02 6.045279471629055e+01 + 3 7.499999999999998e+02 -6.443749249963931e+02 3.789807589401904e+02 -6.045279471629056e+01 + ME 2.008354355265392e+00 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.487240721466687e+02 -4.365618810460560e+01 2.530025352630979e+00 + 3 7.499999999999992e+02 7.487240721466690e+02 4.365618810460511e+01 -2.530025352631356e+00 + ME 2.011085365634105e+00 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.736127789510566e+02 -4.849363338101804e+02 5.451562667122458e+02 + 3 7.500000000000002e+02 1.736127789510568e+02 4.849363338101802e+02 -5.451562667122456e+02 + ME 2.608510167984170e+00 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -1.465394368041592e+02 7.111974597422168e+02 1.876815566786921e+02 + 3 7.499999999999994e+02 1.465394368041591e+02 -7.111974597422169e+02 -1.876815566786921e+02 + ME 1.990297183267100e+00 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -1.708920160408793e+02 -6.330544605807507e+02 3.640576448755041e+02 + 3 7.500000000000007e+02 1.708920160408793e+02 
6.330544605807504e+02 -3.640576448755053e+02 + ME 2.015490029126271e+00 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.460547321795563e+02 -2.478493945403630e+02 5.496415682298780e+02 + 3 7.499999999999998e+02 4.460547321795562e+02 2.478493945403631e+02 -5.496415682298780e+02 + ME 2.646436363519030e+00 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.118512395965141e+01 5.909417855163442e+02 4.613447698344142e+02 + 3 7.499999999999989e+02 2.118512395965102e+01 -5.909417855163435e+02 -4.613447698344139e+02 + ME 2.178298627452733e+00 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.653808167750950e+02 -7.001457156827445e+02 -4.323191990741312e+01 + 3 7.500000000000000e+02 -2.653808167750950e+02 7.001457156827445e+02 4.323191990741302e+01 + ME 2.009675273524294e+00 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 6.408357091392249e+02 3.883393337910243e+02 -3.197117018143499e+01 + 3 7.500000000000001e+02 -6.408357091392249e+02 -3.883393337910244e+02 3.197117018143511e+01 + ME 2.010312327528764e+00 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000103e+02 
6.049264826447564e+01 2.924911886358318e+01 7.469840216173631e+02 + 3 7.500000000000013e+02 -6.049264826448120e+01 -2.924911886358490e+01 -7.469840216173664e+02 + ME 2.379552267034401e+01 + +Event 190 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -3.303745399677809e+02 -6.726435165214554e+02 -3.005599811564162e+01 + 3 7.499999999999998e+02 3.303745399677807e+02 6.726435165214554e+02 3.005599811564161e+01 + ME 2.010402233084874e+00 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000059e+02 2.507415243158216e+02 -6.861277073704366e+02 1.698748279244940e+02 + 3 7.500000000000019e+02 -2.507415243158149e+02 6.861277073704406e+02 -1.698748279244789e+02 + ME 1.993093251791711e+00 + +Event 192 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.280906177080156e+02 -4.243783337877665e+02 -5.241751390360782e+02 + 3 7.500000000000001e+02 3.280906177080155e+02 4.243783337877666e+02 5.241751390360782e+02 + ME 2.456724540584341e+00 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.030479677918753e+02 3.086657427947369e+02 6.126886553888426e+02 + 3 7.500000000000000e+02 3.030479677918753e+02 -3.086657427947368e+02 -6.126886553888426e+02 + ME 3.504352033847108e+00 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.646170069729470e+02 3.428856457724426e+02 -6.122967201805791e+02 + 3 7.500000000000000e+02 2.646170069729470e+02 -3.428856457724426e+02 6.122967201805791e+02 + ME 3.496327512126989e+00 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.753494363932361e+02 4.203961533313043e+02 -2.339660238287344e+02 + 3 7.499999999999999e+02 5.753494363932361e+02 -4.203961533313043e+02 2.339660238287342e+02 + ME 1.984639517527489e+00 + +Event 196 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.281251117808167e+02 -5.086770881665589e+02 3.470684494981563e+02 + 3 7.499999999999999e+02 4.281251117808166e+02 5.086770881665594e+02 -3.470684494981560e+02 + ME 2.004096818030677e+00 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 -8.339544953992255e+01 -6.784858681092462e+02 -3.085484172250403e+02 + 3 7.500000000000001e+02 8.339544953992252e+01 6.784858681092462e+02 3.085484172250404e+02 + ME 1.988754370649548e+00 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 -8.306003478718596e+01 -8.401139414697868e+00 7.453391519822948e+02 + 3 7.499999999999989e+02 8.306003478718694e+01 8.401139414698259e+00 -7.453391519822936e+02 + ME 2.231860287132694e+01 + +Event 199 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.019646293152730e+02 3.128355820782189e+02 -6.510331689640138e+02 + 3 7.499999999999998e+02 2.019646293152729e+02 -3.128355820782187e+02 6.510331689640138e+02 + ME 4.591940058674536e+00 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 6.426710110696489e+01 3.020308647655600e+02 -6.834815991994778e+02 + 3 7.500000000000000e+02 -6.426710110696466e+01 -3.020308647655600e+02 6.834815991994776e+02 + ME 6.359396539868402e+00 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.022805093987975e+02 -5.266311217523920e+02 -3.511837885775218e+02 + 3 7.500000000000000e+02 -4.022805093987974e+02 5.266311217523918e+02 3.511837885775218e+02 + ME 2.006554767315016e+00 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.007651650396935e+02 -3.709039186002585e+02 5.783343312182138e+02 + 3 7.500000000000001e+02 3.007651650396937e+02 3.709039186002584e+02 -5.783343312182138e+02 + ME 2.948288961372177e+00 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.363438797848468e+01 7.118317541034084e+01 -7.458563423902847e+02 + 3 7.500000000000001e+02 
3.363438797848469e+01 -7.118317541034084e+01 7.458563423902847e+02 + ME 2.276381800960898e+01 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -7.528559343153034e+01 -6.321297640200866e+02 -3.965400873323798e+02 + 3 7.499999999999998e+02 7.528559343153087e+01 6.321297640200864e+02 3.965400873323802e+02 + ME 2.048108674862614e+00 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -2.566959580289875e+02 5.032495686744614e+02 4.933021961845994e+02 + 3 7.500000000000003e+02 2.566959580289877e+02 -5.032495686744614e+02 -4.933021961845996e+02 + ME 2.293627718611544e+00 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 -5.149079042091845e+02 5.440418782066432e+02 3.725969592322746e+01 + 3 7.500000000000009e+02 5.149079042091842e+02 -5.440418782066431e+02 -3.725969592322733e+01 + ME 2.010036068339084e+00 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 1.433423326606743e+02 7.409449380754376e+01 7.324363328336467e+02 + 3 7.499999999999998e+02 -1.433423326606745e+02 -7.409449380754376e+01 -7.324363328336465e+02 + ME 1.489028843267540e+01 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000001e+02 3.842573371112484e+01 7.342148980903337e+02 1.481618925735034e+02 + 3 7.499999999999995e+02 -3.842573371112481e+01 -7.342148980903333e+02 -1.481618925735034e+02 + ME 1.996624231741946e+00 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 6.901829459970380e+02 -7.313618628183488e+01 2.842509442570104e+02 + 3 7.500000000000003e+02 -6.901829459970360e+02 7.313618628183590e+01 -2.842509442570093e+02 + ME 1.984622606725810e+00 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 5.714997395800420e+02 4.794362694683772e+02 7.764606350735806e+01 + 3 7.499999999999993e+02 -5.714997395800417e+02 -4.794362694683778e+02 -7.764606350735664e+01 + ME 2.006645066069941e+00 + +Event 211 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.281705743857859e+02 6.553011438753589e+01 -7.360557777573476e+02 + 3 7.500000000000000e+02 1.281705743857858e+02 -6.553011438753575e+01 7.360557777573475e+02 + ME 1.644725159966691e+01 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -3.302472484430331e+02 -5.946768960171297e+02 3.159052773209696e+02 + 3 7.499999999999965e+02 3.302472484430314e+02 5.946768960171261e+02 -3.159052773209676e+02 + ME 1.990748546221434e+00 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.589381160217067e+02 4.877683676965268e+02 -1.103186381930632e+02 + 3 7.500000000000047e+02 -5.589381160217123e+02 -4.877683676965237e+02 1.103186381930646e+02 + ME 2.002478515476537e+00 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000036e+02 -5.489994742235585e+02 5.022963490456322e+02 9.379741487743676e+01 + 3 7.499999999999898e+02 5.489994742235668e+02 -5.022963490456343e+02 -9.379741487743821e+01 + ME 2.004720570188235e+00 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.451972808110462e+02 -4.842724140540300e+02 1.753287026773505e+02 + 3 7.499999999999995e+02 -5.451972808110462e+02 4.842724140540300e+02 -1.753287026773504e+02 + ME 1.992219833660235e+00 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.351512110674362e+02 -6.344873209566202e+02 -2.181731084834278e+02 + 3 7.500000000000000e+02 3.351512110674362e+02 6.344873209566204e+02 2.181731084834278e+02 + ME 1.986182644490094e+00 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000055e+02 2.256146404784491e+02 -3.672666490292709e+02 6.137696982684937e+02 + 3 7.500000000000331e+02 -2.256146404784360e+02 3.672666490292730e+02 -6.137696982684606e+02 + ME 
3.526732459702814e+00 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.145068310337325e+02 3.208524589836873e+02 -4.414027870140238e+02 + 3 7.500000000000002e+02 -5.145068310337326e+02 -3.208524589836873e+02 4.414027870140238e+02 + ME 2.126198299050428e+00 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.924092255386964e+02 2.606260561841842e+02 1.230590233456330e+02 + 3 7.500000000000002e+02 6.924092255386963e+02 -2.606260561841842e+02 -1.230590233456330e+02 + ME 2.000595763661263e+00 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000059e+02 1.410851717299414e+02 -1.722511762646131e+00 -7.366084490902573e+02 + 3 7.499999999999997e+02 -1.410851717299427e+02 1.722511762645365e+00 7.366084490902609e+02 + ME 1.671255294535188e+01 + +Event 221 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.216972323744531e+02 -5.474264157117908e+02 2.915403292662405e+02 + 3 7.499999999999997e+02 4.216972323744527e+02 5.474264157117910e+02 -2.915403292662398e+02 + ME 1.985504218638223e+00 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -5.703291911524502e+02 2.159931833959635e+02 
4.365450245345482e+02 + 3 7.500000000000001e+02 5.703291911524508e+02 -2.159931833959635e+02 -4.365450245345479e+02 + ME 2.115365223704068e+00 + +Event 223 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.653699780465835e+02 5.516351728100433e+02 -2.040328886521882e+02 + 3 7.499999999999999e+02 4.653699780465836e+02 -5.516351728100433e+02 2.040328886521882e+02 + ME 1.987944959130031e+00 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.340910046878815e+02 -5.773211279348985e+02 4.176143158037027e+02 + 3 7.499999999999998e+02 2.340910046878812e+02 5.773211279348985e+02 -4.176143158037027e+02 + ME 2.079067375569363e+00 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.759434290522928e+02 6.827868599275704e+02 2.556286636579899e+02 + 3 7.500000000000007e+02 -1.759434290522934e+02 -6.827868599275705e+02 -2.556286636579902e+02 + ME 1.983533512495551e+00 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.045452037729306e+02 -4.263792434151545e+02 1.233930199803324e+02 + 3 7.500000000000003e+02 6.045452037729307e+02 4.263792434151545e+02 -1.233930199803322e+02 + ME 2.000544909078705e+00 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.413532699307888e+02 4.584643726767907e+02 -2.434071817373498e+02 + 3 7.499999999999997e+02 -5.413532699307889e+02 -4.584643726767907e+02 2.434071817373500e+02 + ME 1.983992565736947e+00 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.451497175256272e+01 -1.575609542562009e+02 -7.331192681200635e+02 + 3 7.499999999999998e+02 -1.451497175256276e+01 1.575609542562010e+02 7.331192681200635e+02 + ME 1.516217829504346e+01 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.976075277594232e+02 5.994840638922487e+02 2.121959259935821e+02 + 3 7.500000000000007e+02 3.976075277594233e+02 -5.994840638922487e+02 -2.121959259935820e+02 + ME 1.986890198024315e+00 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.454994669319925e+02 6.424027082758220e+02 -1.744960708715188e+02 + 3 7.500000000000001e+02 -3.454994669319923e+02 -6.424027082758221e+02 1.744960708715187e+02 + ME 1.992352423518112e+00 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.709700188990096e+01 -9.588670170606436e+01 -7.416506406733005e+02 + 3 7.500000000000001e+02 -5.709700188990106e+01 9.588670170606436e+01 7.416506406733004e+02 + ME 1.956832032390851e+01 + +Event 232 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -6.095841334961023e+02 1.049630507684877e+02 4.241343444749285e+02 + 3 7.499999999999999e+02 6.095841334961023e+02 -1.049630507684882e+02 -4.241343444749288e+02 + ME 2.090570503289665e+00 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.540619742696694e+02 6.247200054612367e+02 2.164833322747417e+02 + 3 7.500000000000000e+02 -3.540619742696694e+02 -6.247200054612367e+02 -2.164833322747417e+02 + ME 1.986376590203003e+00 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.021981849113407e+02 4.737027333003317e+02 -2.930916299308989e+02 + 3 7.500000000000007e+02 -5.021981849113405e+02 -4.737027333003314e+02 2.930916299308989e+02 + ME 1.985728923505389e+00 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.462818585976497e+02 -2.802589959482593e+02 6.506152021861775e+02 + 3 7.499999999999997e+02 2.462818585976497e+02 2.802589959482593e+02 -6.506152021861776e+02 + ME 4.575826699951948e+00 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -7.345429587235462e+02 -4.314359541629803e+01 -1.452076856237854e+02 + 3 7.500000000000018e+02 7.345429587235461e+02 
4.314359541629804e+01 1.452076856237852e+02 + ME 1.997103115301681e+00 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -2.154628275024254e+02 5.951093556116026e+02 -4.023936192687488e+02 + 3 7.499999999999998e+02 2.154628275024254e+02 -5.951093556116025e+02 4.023936192687490e+02 + ME 2.055826393864322e+00 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.866932570143744e+01 -7.232769026327444e+02 -1.895479684399756e+02 + 3 7.499999999999997e+02 5.866932570143744e+01 7.232769026327443e+02 1.895479684399756e+02 + ME 1.990015734098380e+00 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.560694650821692e+02 1.084032761720732e+02 -3.468740213708119e+02 + 3 7.499999999999998e+02 6.560694650821692e+02 -1.084032761720729e+02 3.468740213708119e+02 + ME 2.003985174864813e+00 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.455880632469494e+02 -5.785873576156609e+02 1.708448053034081e+02 + 3 7.499999999999998e+02 4.455880632469495e+02 5.785873576156611e+02 -1.708448053034081e+02 + ME 1.992937128503243e+00 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 
2.940553859063193e+02 5.948775697686276e+02 3.495026566503573e+02 + 3 7.499999999999999e+02 -2.940553859063192e+02 -5.948775697686275e+02 -3.495026566503571e+02 + ME 2.005528586269665e+00 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.255013042950073e+02 3.893831803342041e+02 1.401030235150752e+02 + 3 7.500000000000005e+02 6.255013042950073e+02 -3.893831803342039e+02 -1.401030235150752e+02 + ME 1.997925520851328e+00 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.768430196426253e+02 -8.702513188290250e+01 4.713584168234838e+02 + 3 7.500000000000002e+02 5.768430196426256e+02 8.702513188290267e+01 -4.713584168234838e+02 + ME 2.209730179651860e+00 + +Event 244 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.570848048302291e+02 6.678520603231962e+02 -3.029818305006274e+02 + 3 7.499999999999999e+02 1.570848048302291e+02 -6.678520603231962e+02 3.029818305006275e+02 + ME 1.987492291155503e+00 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999959e+02 -3.619595558326033e+02 -3.635357995317544e+02 5.471078526216538e+02 + 3 7.500000000000020e+02 3.619595558326021e+02 3.635357995317536e+02 -5.471078526216538e+02 + ME 2.624749251730370e+00 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.171130913584885e+02 6.858303858260187e+01 -6.195264609652618e+02 + 3 7.500000000000000e+02 -4.171130913584882e+02 -6.858303858260200e+01 6.195264609652623e+02 + ME 3.652313954526031e+00 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.343959954156197e+02 3.550474070294902e+02 6.176567461744072e+02 + 3 7.499999999999997e+02 -2.343959954156194e+02 -3.550474070294902e+02 -6.176567461744072e+02 + ME 3.610306343463637e+00 + +Event 248 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 5.858635267134600e+02 3.455507392677969e+02 -3.160041370905555e+02 + 3 7.499999999999992e+02 -5.858635267134599e+02 -3.455507392677966e+02 3.160041370905555e+02 + ME 1.990778004173873e+00 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 7.056780166938408e+02 2.349912780195325e+02 -9.642425011268841e+01 + 3 7.500000000000002e+02 -7.056780166938408e+02 -2.349912780195324e+02 9.642425011268844e+01 + ME 2.004381269152324e+00 + +Event 250 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.004573993267173e+02 5.235762666244966e+01 7.413952993179738e+02 + 3 7.500000000000002e+02 -1.004573993267173e+02 -5.235762666244980e+01 -7.413952993179738e+02 + ME 1.940169038103634e+01 + +Event 251 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000024e+02 -6.475270246624093e+02 1.104657610733046e+02 -3.619476038908663e+02 + 3 7.499999999999998e+02 6.475270246624115e+02 -1.104657610733052e+02 3.619476038908668e+02 + ME 2.013889007952173e+00 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.030430118392510e+02 3.645899530085105e+02 4.201450873269225e+02 + 3 7.500000000000001e+02 5.030430118392510e+02 -3.645899530085106e+02 -4.201450873269225e+02 + ME 2.083415027886701e+00 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -5.256865986862222e+02 -4.330514916902604e+02 3.140382230024754e+02 + 3 7.500000000000000e+02 5.256865986862221e+02 4.330514916902599e+02 -3.140382230024749e+02 + ME 1.990205740204019e+00 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000186e+02 9.702918632485427e+01 6.245997683844753e+02 -4.036836215839905e+02 + 3 7.499999999999869e+02 -9.702918632485657e+01 -6.245997683844794e+02 4.036836215839967e+02 + ME 2.057613877988064e+00 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.376163199361313e+01 7.337003421695744e+02 1.459229071312400e+02 + 3 7.500000000000000e+02 
5.376163199361314e+01 -7.337003421695744e+02 -1.459229071312400e+02 + ME 1.996987343463174e+00 + From 32cd140e0c29222d44dffbcadd2ff5bbbeb011b5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 14:21:19 +0100 Subject: [PATCH 26/96] [susy2] in susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc, fix clang formatting --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index 7de0b3ed3a..878c3e7a14 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -40,9 +40,9 @@ Parameters_MSSM_SLHA2::getInstance() void Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_Wsl6 = slha.get_block_entry( "decay", 2000015, 2.699061e-01 ); mdl_Wsl5 = slha.get_block_entry( "decay", 2000013, 2.161216e-01 ); mdl_Wsl4 = slha.get_block_entry( "decay", 2000011, 2.161216e-01 ); From 812b9a646a2a3dca20a3ce79b7916bc0d0bc3c36 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 14:19:13 +0100 Subject: [PATCH 27/96] [susy2] in CODEGEN for Parameters.cc, declare "indices" that was previously commented out (this is needed for susy_gg_tt) --- .../iolibs/template_files/cpp_model_parameters_cc.inc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 54ce4c64cf..125d4ac4fb 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -38,9 +38,9 @@ Parameters_%(model_name)s::getInstance() void Parameters_%(model_name)s::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices %(set_independent_parameters)s } From 3a6dd0a41338d805a25a69b3a070795dff6802e8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 16:00:38 +0100 Subject: [PATCH 28/96] [susy2] in susyggtt.sa Parameters.h, go back to fptype[_sv] for mdl_G__exp__2 instead of cxtype[_sv], because this really is an fptype and not a cxtype --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index c54b16fe5c..078e5385a7 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -840,7 +840,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype_sv mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype_sv G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const cxtype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype_sv mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS out.GC_6 = -G; out.GC_51 = -( cI * G * mdl_I51x11 ); @@ -859,7 +859,7 @@ namespace mg5amcCpu // Model parameters dependent on aS //const fptype mdl_sqrt__aS = constexpr_sqrt( aS ); //const fptype G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); - const cxtype mdl_G__exp__2 = ( ( G ) * ( G ) ); + const fptype mdl_G__exp__2 = ( ( G ) * ( G ) ); // Model couplings dependent on aS const cxtype GC_6 = -G; const cxtype GC_51 = -( cI * G * mdl_I51x11 ); From 464e8969104569f4bf5d7852d0d708cb202c4828 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 14:51:02 +0100 Subject: [PATCH 29/96] [susy2] in susyggtt.sa, various improvements to resync with latest CODEGEN Parameters.h, go back to fptype[_sv] for mdl_G__exp__2 instead of cxtype[_sv], because this really is an fpt> --- .../src/Parameters_MSSM_SLHA2.cc | 1 + .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index 878c3e7a14..fe3cec4f0f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -806,6 +806,7 @@ Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) mdl_vd = mdl_vev * mdl_cos__beta; mdl_vu = mdl_vev * mdl_sin__beta; mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); + // Fixes for Majorana particles if( mdl_Mneu2 < 0 ) mdl_Wneu2 = -abs( mdl_Wneu2 ); if( mdl_Mneu3 < 0 ) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 078e5385a7..d73484fe41 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -22,8 +22,8 @@ //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. 
not in EFT models) for the moment (#439) -//#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" +#ifndef MGONGPU_HARDCODE_PARAM +//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625) #include "read_slha.h" @@ -90,6 +90,7 @@ namespace mg5amcCpu } // end namespace mg5amcGpu/mg5amcCpu #else +//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625) #include #include @@ -756,21 +757,21 @@ namespace mg5amcCpu constexpr double mdl_vu = mdl_vev * mdl_sin__beta; constexpr double mdl_ee__exp__2 = ( ( mdl_ee ) * ( mdl_ee ) ); // Fixes for Majorana particles - constexpr int mdl_Wneu2_sign = ( mdl_Mneu2 < 0 ? -1 : + 1 ); - constexpr int mdl_Wneu3_sign = ( mdl_Mneu3 < 0 ? -1 : + 1 ); - constexpr int mdl_Wneu4_sign = ( mdl_Mneu4 < 0 ? -1 : + 1 ); - constexpr int mdl_Wgo_sign = ( mdl_Mgo < 0 ? -1 : + 1 ); + constexpr int mdl_Wneu2_sign = ( mdl_Mneu2 < 0 ? -1 : +1 ); constexpr double mdl_Wneu2 = mdl_Wneu2_sign * mdl_Wneu2_abs; + constexpr int mdl_Wneu3_sign = ( mdl_Mneu3 < 0 ? -1 : +1 ); constexpr double mdl_Wneu3 = mdl_Wneu3_sign * mdl_Wneu3_abs; + constexpr int mdl_Wneu4_sign = ( mdl_Mneu4 < 0 ? -1 : +1 ); constexpr double mdl_Wneu4 = mdl_Wneu4_sign * mdl_Wneu4_abs; + constexpr int mdl_Wgo_sign = ( mdl_Mgo < 0 ? -1 : +1 ); constexpr double mdl_Wgo = mdl_Wgo_sign * mdl_Wgo_abs; // Model couplings independent of aS // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr cxsmpl mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -814,9 +815,9 @@ namespace mg5amcCpu cxtype_sv GC_51; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> From b07d5978c221c97313b1b91b47da8f1d75bba8b1 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 12:26:53 +0100 Subject: [PATCH 30/96] [susy2] in CODEGEN, almost complete the backport from susy_gg_tt.sa (still missing: need to add a third parameter to cIPC and related variables) If I regenerate the code and diff to what it should be, this is what I get ______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc /tmp/git-blob-b4Knci/CPPProcess.cc 0651b576632e9d50c3ed2bf9461bd41309c90821 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc 0000000000000000000000000000000000000000 100644 79c79 < __device__ const fptype cIPD[3] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT, (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; --- > __device__ const fptype cIPD[2] = { 
(fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; 83c83 < __device__ __constant__ fptype cIPD[3]; --- > __device__ __constant__ fptype cIPD[2]; 86c86 < static fptype cIPD[3]; --- > static fptype cIPD[2]; 505c505 < const fptype tIPD[3] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT, (fptype)m_pars->mdl_I51x11 }; --- > const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; 508c508 < gpuMemcpyToSymbol( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); 511c511 < memcpy( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); 514c514 < //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; --- > //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; ______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h /tmp/git-blob-OpNqx8/Parameters_MSSM_SLHA2.h 68d2e3d1bd385d2728480d8e1d1efe8b18563c04 100644 epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h 0000000000000000000000000000000000000000 100644 830,831c830 < //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds < const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds --- > // ??? 
--- .../template_files/cpp_model_parameters_h.inc | 18 +++++---- .../gpu/process_function_definitions.inc | 4 +- .../CUDACPP_SA_OUTPUT/model_handling.py | 37 ++++++++++++++----- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 94b8dd6444..59aa75bd41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -20,7 +20,7 @@ //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439)%(efterror)s +#ifndef MGONGPU_HARDCODE_PARAM%(eftwarn0)s #include "read_slha.h" @@ -79,7 +79,7 @@ namespace mg5amcCpu } // end namespace mg5amcGpu/mg5amcCpu -#else +#else%(eftwarn1)s #include #include @@ -170,16 +170,19 @@ namespace mg5amcCpu %(dcoupdecl)s }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* cIPD ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_%(model_name)s; +#else +%(eftspecial0)s #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_%(model_name)s) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -220,12 +223,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* cIPD ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_%(model_name)s_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, cIPD ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s mgDebug( 1, __FUNCTION__ ); return; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 9d024183db..3848d35d40 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -230,7 +230,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, cIPD ); #else using namespace mg5amcCpu; using 
G_ACCESS = HostAccessGs; @@ -240,7 +240,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, cIPD ); } #endif } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index b75e8a3eaf..a904954046 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -706,6 +706,11 @@ def write_set_parameters(self, params): # AV - new method (merging write_parameters and write_set_parameters) def write_hardcoded_parameters(self, params): + majorana_widths = [] + for particle in self.model.get('particles'): + if particle.is_fermion() and particle.get('self_antipart') and \ + particle.get('width').lower() != 'zero': + majorana_widths.append( particle.get('width') ) ###misc.sprint(params) # for debugging pardef = super().write_parameters(params) parset = self.super_write_set_parameters_donotfixMajorana(params) @@ -736,7 +741,10 @@ def write_hardcoded_parameters(self, params): type, pars = line.rstrip(';').split(' ') # strip trailing ';' for par in pars.split(','): ###print(len(pardef_lines), par) # for debugging - pardef_lines[par] = ( 'constexpr ' + type + ' ' + par ) + if par in majorana_widths: + pardef_lines[par] = ( 'constexpr ' + type + ' ' + par + "_abs" ) + else: + pardef_lines[par] = ( 'constexpr ' + type + ' ' + par ) ###misc.sprint( 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) ) ###print( pardef_lines ) # for debugging ###for line in pardef_lines: misc.sprint(line) # for debugging @@ -774,9 +782,9 @@ def write_hardcoded_parameters(self, params): def super_write_set_parameters_donotfixMajorana(self, params): """Write out the lines of 
independent parameters""" res_strings = [] - # For each parameter, write name = expr; + # For each parameter, write "name = expr;" for param in params: - res_strings.append("%s" % param.expr) + res_strings.append( "%s" % param.expr ) return "\n".join(res_strings) # AV - replace export_cpp.UFOModelConverterCPP method (eventually split writing of parameters and fixes for Majorana particles #622) @@ -789,10 +797,15 @@ def super_write_set_parameters_onlyfixMajorana(self, hardcoded): # FIXME! split for particle in self.model.get('particles'): if particle.is_fermion() and particle.get('self_antipart') and \ particle.get('width').lower() != 'zero': - res_strings.append( prefix+" if( %s < 0 )" % particle.get('mass')) - res_strings.append( prefix+" %(width)s = -abs( %(width)s );" % {"width": particle.get('width')}) + if hardcoded: + res_strings.append( prefix+" constexpr int %s_sign = ( %s < 0 ? -1 : +1 );" % ( particle.get('width'), particle.get('mass') ) ) + res_strings.append( prefix+" constexpr double %(W)s = %(W)s_sign * %(W)s_abs;" % { 'W' : particle.get('width') } ) + else: + res_strings.append( prefix+" if( %s < 0 )" % particle.get('mass')) + res_strings.append( prefix+" %(width)s = -abs( %(width)s );" % {"width": particle.get('width')}) + if len( res_strings ) != 0 : res_strings = [ prefix + " // Fixes for Majorana particles" ] + res_strings if not hardcoded: return '\n' + '\n'.join(res_strings) if res_strings else '' - else: return '\n'.join(res_strings) + else: return '\n'.join(res_strings) + '\n' # AV - replace export_cpp.UFOModelConverterCPP method (add hardcoded parameters and couplings) def super_generate_parameters_class_files(self): @@ -833,7 +846,7 @@ def super_generate_parameters_class_files(self): ###misc.sprint(self.coups_indep) # for debugging replace_dict['hardcoded_independent_couplings'] = self.write_hardcoded_parameters(self.coups_indep) ###misc.sprint(self.params_dep) # for debugging - hrd_params_dep = [ line.replace('constexpr','//constexpr') + ' 
// now computed event-by-event (running alphas #373)' if line != '' else line for line in self.write_hardcoded_parameters(self.params_dep).split('\n') ] + hrd_params_dep = [ line.replace('constexpr ','//constexpr ') + ' // now computed event-by-event (running alphas #373)' if line != '' else line for line in self.write_hardcoded_parameters(self.params_dep).split('\n') ] replace_dict['hardcoded_dependent_parameters'] = '\n'.join( hrd_params_dep ) ###misc.sprint(self.coups_dep) # for debugging hrd_coups_dep = [ line.replace('constexpr','//constexpr') + ' // now computed event-by-event (running alphas #373)' if line != '' else line for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') ] @@ -854,7 +867,7 @@ def super_generate_parameters_class_files(self): foundG = False for line in self.write_hardcoded_parameters(self.params_dep).split('\n'): if line != '': - dcoupsetdpar.append( ' ' + line.replace('constexpr double', 'const fptype_sv' if foundG else '//const fptype_sv' ) ) + dcoupsetdpar.append( ' ' + line.replace('constexpr cxsmpl mdl_G__exp__2','const fptype_sv mdl_G__exp__2').replace('constexpr double', 'const fptype_sv' if foundG else '//const fptype_sv' ) ) if 'constexpr double G =' in line: foundG = True replace_dict['dcoupsetdpar'] = ' ' + '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] @@ -889,11 +902,15 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupoutdcoup2'] = '' # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD if self.model_name[:2] == 'sm' : - replace_dict['efterror'] = '' + replace_dict['eftwarn0'] = '' + replace_dict['eftwarn1'] = '' + replace_dict['eftspecial0'] = ' // SM implementation - no special handling of non-hardcoded parameters (PR #625)' replace_dict['eftspecial1'] = ' // Begin SM 
implementation - no special handling of vectors of floats as in EFT (#439)' replace_dict['eftspecial2'] = ' // End SM implementation - no special handling of vectors of floats as in EFT (#439)' else: - replace_dict['efterror'] = '\n// WARNING! Support for non-SM physics processes is still limited (see PR #625)\n//#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1"' + replace_dict['eftwarn0'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625)' + replace_dict['eftwarn1'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625)' + replace_dict['eftspecial0'] = ' // ???' replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else From e3fcb60f7708fd7be55e228991f694b840b470d6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 27 Feb 2024 19:00:23 +0100 Subject: [PATCH 31/96] [susy2] progress in CODEGEN: implement a mechanism to identify and print out the relevant independent parameters Now the .h file difference is the following. I will move the cIPD handling to an additional parallel mechanism in Parameters.h. 
______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h /tmp/git-blob-J5f0OS/Parameters_MSSM_SLHA2.h 68d2e3d1bd385d2728480d8e1d1efe8b18563c04 100644 epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h 0000000000000000000000000000000000000000 100644 830,831c830 < //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds < const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds --- > const fptype mdl_I51x11; --- .../PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index a904954046..74d9c35bd8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -900,6 +900,18 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar2'] = ' // (none)' replace_dict['dcoupsetdcoup2'] = ' // (none)' replace_dict['dcoupoutdcoup2'] = '' + # Identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM/EFT/SUSY models + # because they are used in the event by event calculation of alphaS-dependent couplings + # WARNING! 
This is only implemented and has only been tested so far for real parameters (complex parameters need twice the storage) + if self.model_name[:2] != 'sm' : + param_indep_real_used = [] + for param in self.params_indep: + if param.type == 'real': + for coup in self.coups_dep.values(): + if param.name in coup.expr: + param_indep_real_used.append( param.name ) + param_indep_real_used = set( param_indep_real_used ) + misc.sprint('PIPPO!', param_indep_real_used ) # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD if self.model_name[:2] == 'sm' : replace_dict['eftwarn0'] = '' @@ -910,7 +922,10 @@ def super_generate_parameters_class_files(self): else: replace_dict['eftwarn0'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625)' replace_dict['eftwarn1'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625)' - replace_dict['eftspecial0'] = ' // ???' + if len( param_indep_real_used ) == 0: + replace_dict['eftspecial0'] = ' // No additional parameters needed in constant memory for this BSM model' + else: + replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s;' % param for param in param_indep_real_used ) replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else From ddbe2405aa7aa9f35f819a02243a2d6c3acdf531 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 28 Feb 2024 13:46:19 +0100 Subject: [PATCH 32/96] [susy2] in susyggtt.sa, first implementation of constexpr trig functions for HRDCOD=1 cuda builds (#627) Builds still fail because assert is not constexpr (I need to extend the function to cover any value and not just 0 to pi/2). Also missing is atan. 
--- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 43 +----- .../susy_gg_tt.sa/src/constexpr_math.h | 130 ++++++++++++++++++ 2 files changed, 137 insertions(+), 36 deletions(-) create mode 100644 epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index d73484fe41..743d4ef530 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -20,6 +20,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== #ifndef MGONGPU_HARDCODE_PARAM @@ -105,37 +107,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_MSSM_SLHA2 // keep the same name rather than HardcodedParameters_MSSM_SLHA2 for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? 
i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -396,11 +367,11 @@ namespace mg5amcCpu constexpr cxsmpl mdl_ye3x3 = mdl_Rye3x3; constexpr cxsmpl mdl_yu3x3 = mdl_Ryu3x3; constexpr double mdl_MZ__exp__2 = ( ( mdl_MZ ) * ( mdl_MZ ) ); - constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * cos( 2. * mdl_beta ) ) * tan( 2. * mdl_beta ) ) / 2.; + constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * constexpr_cos( 2. * mdl_beta ) ) * constexpr_tan( 2. * mdl_beta ) ) / 2.; constexpr double mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); constexpr double mdl_sw = constexpr_sqrt( 1. 
- mdl_cw__exp__2 ); - constexpr double mdl_cos__beta = cos( mdl_beta ); - constexpr double mdl_sin__beta = sin( mdl_beta ); + constexpr double mdl_cos__beta = constexpr_cos( mdl_beta ); + constexpr double mdl_sin__beta = constexpr_sin( mdl_beta ); constexpr cxsmpl mdl_conjg__yu3x3 = conj( mdl_yu3x3 ); constexpr cxsmpl mdl_I1x33 = mdl_conjg__CKM3x3 * mdl_conjg__yu3x3; constexpr cxsmpl mdl_conjg__yd3x3 = conj( mdl_yd3x3 ); @@ -746,8 +717,8 @@ namespace mg5amcCpu constexpr cxsmpl mdl_conjg__VV1x2 = conj( mdl_VV1x2 ); constexpr cxsmpl mdl_conjg__VV2x1 = conj( mdl_VV2x1 ); constexpr cxsmpl mdl_conjg__VV2x2 = conj( mdl_VV2x2 ); - constexpr double mdl_cos__alp = cos( mdl_alp ); - constexpr double mdl_sin__alp = sin( mdl_alp ); + constexpr double mdl_cos__alp = constexpr_cos( mdl_alp ); + constexpr double mdl_sin__alp = constexpr_sin( mdl_alp ); constexpr cxsmpl mdl_conjg__MUH = conj( mdl_MUH ); constexpr double mdl_ee = 2. * constexpr_sqrt( 1. / aEWM1 ) * constexpr_sqrt( M_PI ); constexpr double mdl_gp = mdl_ee / mdl_cw; diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h new file mode 100644 index 0000000000..3a6f67ae0f --- /dev/null +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -0,0 +1,130 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double x, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double x ) + { + return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( x, x, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double d ) + { + const int i = static_cast( d ); + return d < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 
1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of cos + // Taylor expansion : 1 - x**2/2! + x**4/4! + constexpr long double cosTaylor( const long double x ) + { + long double cosx = 0; + int ipow = 0; + long double delta = 1; + while( true ) + { + long double cosxlast = cosx; + cosx += delta; + //std::cout << "ipow=" << ipow << ", delta=" << delta << ", cosx=" << cosx <= 0 && "The argument of constexpr_sin must be between 0 and pi/2" ); + assert( x <= constexpr_pi_by_2 && "The argument of constexpr_sin must be between 0 and pi/2" ); + if ( x < constexpr_pi_by_4 ) return sinTaylor( (long double)x ); + else return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + assert( x >= 0 && "The argument of constexpr_sin must be between 0 and pi/2" ); + assert( x <= constexpr_pi_by_2 && "The argument of constexpr_sin must be between 0 and pi/2" ); + if ( x < constexpr_pi_by_4 ) + return sinTaylor( (long double)x ) / constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x ), 2 ) ); + else + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ) / sinTaylor( constexpr_pi_by_2 - (long double)x ); + } +} + +#endif // constexpr_math_h From 5aca39012f0fd59e3b88a3279dcfcbc5c97709ab Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 28 Feb 2024 17:08:13 +0100 Subject: [PATCH 33/96] [susy2] in susyggtt.sa, complete(?) the implementation of constexpr trig functions for HRDCOD=1 cuda builds (#627) Extend coverage of sin/cos/tan to any value, and add an atan function. 
Now the build fails with the following: HRDCOD=1 make ccache /usr/local/cuda-12.0/bin/nvcc -Xcompiler -O3 -lineinfo -I. -I../../src -I/usr/local/cuda-12.0/include/ -DUSE_NVTX -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70 -use_fast_math -std=c++17 -ccbin /usr/lib64/ccache/g++ -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HARDCODE_PARAM -Xcompiler -fPIC -c -x cu CPPProcess.cc -o CPPProcess_cu.o ../../src/Parameters_MSSM_SLHA2.h(818): error: identifier "mg5amcGpu::Parameters_MSSM_SLHA2::mdl_I51x11" is undefined in device code --- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- .../susy_gg_tt.sa/src/constexpr_math.h | 167 +++++++++++++----- 2 files changed, 128 insertions(+), 41 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 743d4ef530..c57c6940de 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -302,7 +302,7 @@ namespace mg5amcCpu constexpr double mdl_I101x44 = 1.; constexpr double mdl_I100x44 = 1.; constexpr double mdl_CKM3x3 = 1.; - constexpr double mdl_atan__tb = atan( mdl_tb ); + constexpr double mdl_atan__tb = constexpr_atan( mdl_tb ); constexpr double mdl_beta = mdl_atan__tb; constexpr double mdl_cw = mdl_MW / mdl_MZ; constexpr cxsmpl mdl_mD21x1 = mdl_RmD21x1; diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h index 3a6f67ae0f..0179729379 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -12,6 +12,13 @@ #include #include +// FOR DEBUGGING! 
+#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu @@ -55,30 +62,12 @@ namespace mg5amcCpu constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 - // Constexpr implementation of cos - // Taylor expansion : 1 - x**2/2! + x**4/4! - constexpr long double cosTaylor( const long double x ) - { - long double cosx = 0; - int ipow = 0; - long double delta = 1; - while( true ) - { - long double cosxlast = cosx; - cosx += delta; - //std::cout << "ipow=" << ipow << ", delta=" << delta << ", cosx=" << cosx <= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( x < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); long double sinx = 0; int ipow = 1; long double delta = x; @@ -86,7 +75,9 @@ namespace mg5amcCpu { long double sinxlast = sinx; sinx += delta; - //std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx <= 0 && "The argument of constexpr_cos must be between 0 and pi/2" ); - assert( x <= constexpr_pi_by_2 && "The argument of constexpr_cos must be between 0 and pi/2" ); - //if ( x < constexpr_pi_by_4 ) return cosTaylor( (long double)x ); - //else return constexpr_sqrt( 1 - constexpr_pow( cosTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ); - if ( x < constexpr_pi_by_4 ) return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x ), 2 ) ); - else return sinTaylor( constexpr_pi_by_2 - (long double)x ); + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double x, const bool assume0toPi = false ) + { + if( assume0toPi ) + { + assert( x >= 0 && "The 
argument of constexpr_sin_quad is assumed to be in [0,pi)" ); + assert( x < constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,pi)" ); + } + if( x < 0 ) + return constexpr_sin_quad( x + ( constexpr_floor( -x / constexpr_pi ) + 1 ) * constexpr_pi, true ); + else if( x < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( (long double)x ); + else if( x < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ); + else if( x < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x - constexpr_pi_by_2 ), 2 ) ); + else if( x < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - (long double)x ); + else if( x < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - (long double)x, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( x - constexpr_floor( x / constexpr_pi ) * constexpr_pi, true ); } // Constexpr implementation of sin (double signature, internally implemented as long double) constexpr double constexpr_sin( const double x ) { - assert( x >= 0 && "The argument of constexpr_sin must be between 0 and pi/2" ); - assert( x <= constexpr_pi_by_2 && "The argument of constexpr_sin must be between 0 and pi/2" ); - if ( x < constexpr_pi_by_4 ) return sinTaylor( (long double)x ); - else return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ); + return constexpr_sin_quad( x ); } - // Constexpr implementation of sin (double signature, internally implemented as long double) + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double x, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( x >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( x < 2 * constexpr_pi && "The argument of 
constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( x < 0 ) + return constexpr_tan_quad( x + ( constexpr_floor( -x / constexpr_pi ) + 1 ) * constexpr_pi, true ); + else if( x < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( x, assume0to2Pi ) / constexpr_cos_quad( x, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( x - constexpr_floor( x / constexpr_pi ) * constexpr_pi, true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) constexpr double constexpr_tan( const double x ) { - assert( x >= 0 && "The argument of constexpr_sin must be between 0 and pi/2" ); - assert( x <= constexpr_pi_by_2 && "The argument of constexpr_sin must be between 0 and pi/2" ); - if ( x < constexpr_pi_by_4 ) - return sinTaylor( (long double)x ) / constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x ), 2 ) ); - else - return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ) / sinTaylor( constexpr_pi_by_2 - (long double)x ); + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( x < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = x; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx < 1 ) + return atanTaylor( 1 / (long double)x ); + else if( x == 1 ) + return constexpr_pi_by_4; + else if( x > -1 ) + return atanTaylor( (long double)x ); + else if( x == -1 ) + return -constexpr_pi_by_4; + else if( x < -1 ) + return atanTaylor( 1 / (long double)x ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); } } 
From d374ede66c22cac01c77f1f6a335956408615cd9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 28 Feb 2024 17:12:44 +0100 Subject: [PATCH 34/96] [susy2] in susyggtt.sa, fix the last pending issue for HRDCOD=1 cuda builds (#627): it was enought to define mdl_I51x11 as a "__device__" constexpr. NB: Now the build fully succeeds and runTest.exe also succeeds for HRDCOD=1! --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index c57c6940de..2805ef7d81 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -292,7 +292,7 @@ namespace mg5amcCpu constexpr double mdl_I5x11 = 1.; constexpr double mdl_I53x11 = 1.; constexpr double mdl_I52x44 = 1.; - constexpr double mdl_I51x11 = 1.; + __device__ constexpr double mdl_I51x11 = 1.; constexpr double mdl_I39x11 = 1.; constexpr double mdl_I31x11 = 1.; constexpr double mdl_I26x44 = 1.; From ddedfb748b4785db302919c6327f429c7f3404c8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 28 Feb 2024 17:27:20 +0100 Subject: [PATCH 35/96] [susy2] in susyggtt.sa testmisc.cc, add the basic test I had used for constexpr trig functions - I will clean it up --- .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index ac0b049e60..38fc06dd23 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -10,6 +10,7 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include @@ -295,4 +296,44 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } 
//-------------------------------------------------------------------------- + + std::cout << std::scientific << std::setprecision( 20 ); + + // Test constexpr sin, cos, tan + { + const int nstep = 8; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + //long double x0 = 0, x1 = constexpr_pi_by_2; + //long double x0 = constexpr_pi_by_2 - 1E-15, x1 = constexpr_pi_by_2; // this is at the limit of precision for "double x" + //long double x0 = 0, x1 = 2 * constexpr_pi; + long double x0 = 0+0.1, x1 = 2 * constexpr_pi+0.1; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + std::cout << std::endl; + std::cout << "x = " << x << std::endl; + std::cout << "sin = " << std::sin( x ) << std::endl; + std::cout << "constexpr_sin = " << constexpr_sin( x ) << std::endl; + std::cout << "cos = " << std::cos( x ) << std::endl; + std::cout << "constexpr_cos = " << constexpr_cos( x ) << std::endl; + std::cout << "tan = " << std::tan( x ) << std::endl; + std::cout << "constexpr_tan = " << constexpr_tan( x ) << std::endl; + } + } + + // Test constexpr atan + { + const int nstep = 40; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -2, x1 = +2; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + std::cout << std::endl; + std::cout << "x = " << x << std::endl; + std::cout << "atan = " << std::tan( x ) << std::endl; + std::cout << "constexpr_atan = " << constexpr_tan( x ) << std::endl; + } + } + + //-------------------------------------------------------------------------- + } From 07281171273f031578a14ba29bbfeb270f7027a9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 11:29:16 +0100 Subject: [PATCH 36/96] [susy2] in susyggtt.sa constexpr_math.h, many fixes in sin/cos/tan after more intense testing --- .../susy_gg_tt.sa/src/constexpr_math.h | 142 +++++++++--------- 1 file changed, 74 insertions(+), 68 
deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h index 0179729379..14677f6bda 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -27,22 +27,22 @@ namespace mg5amcCpu #endif { // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - constexpr long double sqrtNewtonRaphson( const long double x, const long double curr, const long double prev ) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); } - constexpr long double constexpr_sqrt( const long double x ) + constexpr long double constexpr_sqrt( const long double xx ) { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) : std::numeric_limits::quiet_NaN(); } // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( const long double d ) + constexpr int constexpr_floor( const long double xx ) { - const int i = static_cast( d ); - return d < i ? i - 1 : i; + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; } // Constexpr implementation of pow @@ -64,13 +64,13 @@ namespace mg5amcCpu // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); - assert( x < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx >= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); long double sinx = 0; int ipow = 1; - long double delta = x; + long double delta = xx; while( true ) { long double sinxlast = sinx; @@ -81,33 +81,39 @@ namespace mg5amcCpu if ( sinx == sinxlast ) break; // Next iteration ipow += 2; - delta *= -x * x / ( ipow - 1 ) / ipow; + delta *= -xx * xx / ( ipow - 1 ) / ipow; } return sinx; } + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + // Constexpr implementation of cos (long double signature) - constexpr long double constexpr_cos_quad( const long double x, const bool assume0to2Pi = false ) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) { if( assume0to2Pi ) { - assert( x >= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); - assert( x < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx >= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); } - if( x < 0 ) - return constexpr_cos_quad( x + ( constexpr_floor( -x / constexpr_pi ) + 1 ) * constexpr_pi, true ); - else if( x < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) - return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x ), 2 ) ); - else if( x < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) - return sinTaylor( 
constexpr_pi_by_2 - (long double)x ); - else if( x < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) - return -sinTaylor( (long double)x - constexpr_pi_by_2 ); - else if( x < constexpr_pi ) // [3/4*pi, 4/4*pi) - return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - (long double)x ), 2 ) ); - else if( x < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) - return constexpr_cos_quad( 2 * constexpr_pi - (long double)x, true ); + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); else // [8/4*pi, +inf) - return constexpr_cos_quad( x - constexpr_floor( x / constexpr_pi ) * constexpr_pi, true ); + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); } // Constexpr implementation of cos (double signature, internally implemented as long double) @@ -117,27 +123,27 @@ namespace mg5amcCpu } // Constexpr implementation of sin (long double signature) - constexpr long double constexpr_sin_quad( const long double x, const bool assume0toPi = false ) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) { - if( assume0toPi ) + if( assume0to2Pi ) { - assert( x >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,pi)" ); - assert( x < constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,pi)" ); + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * 
constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); } - if( x < 0 ) - return constexpr_sin_quad( x + ( constexpr_floor( -x / constexpr_pi ) + 1 ) * constexpr_pi, true ); - else if( x < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) - return sinTaylor( (long double)x ); - else if( x < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) - return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - (long double)x ), 2 ) ); - else if( x < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) - return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( (long double)x - constexpr_pi_by_2 ), 2 ) ); - else if( x < constexpr_pi ) // [3/4*pi, 4/4*pi) - return sinTaylor( constexpr_pi - (long double)x ); - else if( x < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) - return -constexpr_sin_quad( 2 * constexpr_pi - (long double)x, true ); + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); else // [8/4*pi, +inf) - return constexpr_sin_quad( x - constexpr_floor( x / constexpr_pi ) * constexpr_pi, true ); + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); } // Constexpr implementation of sin (double signature, internally implemented as long double) @@ -147,19 +153,19 @@ namespace mg5amcCpu } // Constexpr implementation of tan (long double signature) - constexpr long double constexpr_tan_quad( const long double x, const bool assume0to2Pi = false ) + constexpr long double 
constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) { if( assume0to2Pi ) { - assert( x >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); - assert( x < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); } - if( x < 0 ) - return constexpr_tan_quad( x + ( constexpr_floor( -x / constexpr_pi ) + 1 ) * constexpr_pi, true ); - else if( x < 2 * constexpr_pi ) // [0, 2*pi) - return constexpr_sin_quad( x, assume0to2Pi ) / constexpr_cos_quad( x, assume0to2Pi ); + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); else // [8/4*pi, +inf) - return constexpr_tan_quad( x - constexpr_floor( x / constexpr_pi ) * constexpr_pi, true ); + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); } // Constexpr implementation of tan (double signature, internally implemented as long double) @@ -170,13 +176,13 @@ namespace mg5amcCpu // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); - assert( x < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx >= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); long double atanx = 0; int ipow = 1; - long double xpow = x; + long double xpow = xx; while( true ) { long double atanxlast = atanx; @@ -187,24 +193,24 @@ namespace mg5amcCpu if ( atanx == atanxlast ) break; // Next iteration ipow += 2; - xpow *= -x * x; + xpow *= -xx * xx; } return atanx; } // Constexpr implementation of atan (long double signature) - constexpr long double 
constexpr_atan_quad( const long double x ) + constexpr long double constexpr_atan_quad( const long double xx ) { - if( x > 1 ) - return atanTaylor( 1 / (long double)x ); - else if( x == 1 ) + if( xx > 1 ) + return atanTaylor( 1 / xx ); + else if( xx == 1 ) return constexpr_pi_by_4; - else if( x > -1 ) - return atanTaylor( (long double)x ); - else if( x == -1 ) + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) return -constexpr_pi_by_4; - else if( x < -1 ) - return atanTaylor( 1 / (long double)x ); + else if( xx < -1 ) + return atanTaylor( 1 / xx ); } // Constexpr implementation of atan (double signature, internally implemented as long double) From 9fd060b73954dde223af06b25a2d5e7252196d59 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 09:37:07 +0100 Subject: [PATCH 37/96] [susy2] in susyggtt.sa, add many additional tests for constexpr_math in testmisc.cc - still need to clean up Note: I get here a test failure when x is within ~1E-15 of the horizontal or vertical axis (e.g. when it is equivalent to 3pi/2). ./runTest.exe --gtest_filter=*misc Running main() from /data/avalassi/GPU2023/madgraph4gpuBis/test/googletest/googletest/src/gtest_main.cc Note: Google Test filter = *misc [==========] Running 2 tests from 2 test suites. [----------] Global test environment set-up. 
[----------] 1 test from SIGMA_MSSM_SLHA2_GG_TTX_CPU_MISC [ RUN ] SIGMA_MSSM_SLHA2_GG_TTX_CPU_MISC.testmisc testSinCosTanX: xx= 4.712388980384687897640105802565813064575 x= 4.712388980384687897640105802565813064575 testSinCosTanX: xx= 4.712388980384687898073786671560014838178 x= 4.712388980384687897640105802565813064575 testSinCosTanX: xx= 4.712388980384687897640105802565813064575 x= 4.712388980384687897640105802565813064575 testmisc.cc:390: Failure The difference between std::cos( x ) and constexpr_cos( x ) is 1.7764320877644403e-15, which exceeds std::abs( std::cos( x ) * tolerance ), where std::cos( x ) evaluates to -1.8369701987210297e-16, constexpr_cos( x ) evaluates to -1.9601291076365435e-15, and std::abs( std::cos( x ) * tolerance ) evaluates to 3.6739403974420597e-19. x=4.712388980384687897640105802565813064575, x(0to2Pi)=4.712388980384687897640105802565813064575, istep=55, distance4=1.960020687419294915798673173412680625916e-15 testmisc.cc:390: Failure The difference between std::cos( x ) and constexpr_cos( x ) is 1.7762314254599337e-15, which exceeds std::abs( std::cos( x ) * tolerance ), where std::cos( x ) evaluates to 3.0616169978683831e-16, constexpr_cos( x ) evaluates to -1.4700697256730955e-15, and std::abs( std::cos( x ) * tolerance ) evaluates to 6.123233995736766e-19. x=7.853981633974484566351748071610927581787, x(0to2Pi)=1.570796326794898089326130152798555172922, istep=65, distance4=1.470069725673095462070705252699553966522e-15 testmisc.cc:390: Failure The difference between std::cos( x ) and constexpr_cos( x ) is 1.7765324189166937e-15, which exceeds std::abs( std::cos( x ) * tolerance ), where std::cos( x ) evaluates to -4.2862637970157361e-16, constexpr_cos( x ) evaluates to -2.2051587986182675e-15, and std::abs( std::cos( x ) * tolerance ) evaluates to 8.5725275940314727e-19. 
x=10.99557428756427412963603273965418338776, x(0to2Pi)=4.712388980384687652610414820841810978891, istep=75, distance4=2.205700899704510220544761978089809417725e-15 testmisc.cc:390: Failure The difference between std::cos( x ) and constexpr_cos( x ) is 1.7761310943076804e-15, which exceeds std::abs( std::cos( x ) * tolerance ), where std::cos( x ) evaluates to 5.5109105961630896e-16, constexpr_cos( x ) evaluates to -1.2250400346913715e-15, and std::abs( std::cos( x ) * tolerance ) evaluates to 1.1021821192326179e-18. x=14.13716694115407079834767500869929790497, x(0to2Pi)=1.570796326794897844296439171074553087237, istep=85, distance4=1.22504003469137145998502091970294713974e-15 [ FAILED ] SIGMA_MSSM_SLHA2_GG_TTX_CPU_MISC.testmisc (0 ms) [----------] 1 test from SIGMA_MSSM_SLHA2_GG_TTX_CPU_MISC (0 ms total) [----------] 1 test from SIGMA_MSSM_SLHA2_GG_TTX_GPU_MISC [ RUN ] SIGMA_MSSM_SLHA2_GG_TTX_GPU_MISC.testmisc testSinCosTanX: xx= 4.712388980384687897640105802565813064575 x= 4.712388980384687897640105802565813064575 testSinCosTanX: xx= 4.712388980384687898073786671560014838178 x= 4.712388980384687897640105802565813064575 testSinCosTanX: xx= 4.712388980384687897640105802565813064575 x= 4.712388980384687897640105802565813064575 [ OK ] SIGMA_MSSM_SLHA2_GG_TTX_GPU_MISC.testmisc (0 ms) [----------] 1 test from SIGMA_MSSM_SLHA2_GG_TTX_GPU_MISC (0 ms total) [----------] Global test environment tear-down [==========] 2 tests from 2 test suites ran. (0 ms total) [ PASSED ] 1 test. 
[ FAILED ] 1 test, listed below: [ FAILED ] SIGMA_MSSM_SLHA2_GG_TTX_CPU_MISC.testmisc 1 FAILED TEST --- .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 3 + .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 113 +++++++++++++++--- 2 files changed, 99 insertions(+), 17 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..15e90b0d75 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +#$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index 38fc06dd23..e6eed95cd4 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -15,6 +15,9 @@ #include +#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -297,29 +300,104 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) //-------------------------------------------------------------------------- - std::cout << std::scientific << std::setprecision( 20 ); + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. 
from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision(6); // default + }; + //testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + //testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + 
testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) - // Test constexpr sin, cos, tan + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) { - const int nstep = 8; + auto toleranceForX = []( const double ) + { + return 2E-3; // NB: tolerance 1E-3 is not enough for cos( x=-7.8539816339744828 ) with 100 steps in [-4*pi,6*pi] + }; for ( int istep = 0; istep < nstep + 1; istep++ ) { - //long double x0 = 0, x1 = constexpr_pi_by_2; - //long double x0 = constexpr_pi_by_2 - 1E-15, x1 = constexpr_pi_by_2; // this is at the limit of precision for "double x" - //long double x0 = 0, x1 = 2 * constexpr_pi; - long double x0 = 0+0.1, x1 = 2 * constexpr_pi+0.1; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) - std::cout << std::endl; - std::cout << "x = " << x << std::endl; - std::cout << "sin = " << std::sin( x ) << std::endl; - std::cout << "constexpr_sin = " << constexpr_sin( x ) << std::endl; - std::cout << "cos = " << std::cos( x ) << std::endl; - std::cout << "constexpr_cos = " << constexpr_cos( x ) << std::endl; - std::cout << "tan = " << std::tan( x ) << std::endl; - std::cout << "constexpr_tan = " << constexpr_tan( x ) << std::endl; + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x 
); + const bool debug = false; + testSinCosTanX( x, tolerance, debug, istep ); // strangely, this succeeds instead! } - } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); + /* // Test constexpr atan { const int nstep = 40; @@ -333,7 +411,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) std::cout << "constexpr_atan = " << constexpr_tan( x ) << std::endl; } } - + */ + //-------------------------------------------------------------------------- } From 052b0731a55f441dbbe6256859cc63a5eadda8cb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:06:47 +0100 Subject: [PATCH 38/96] [susy2] in susyggtt.sa testmisc.cc, fix tolerances to bypass problematic cases (horizontal/vertical axis within a few E-15) --- .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 35 ++++++++++++------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index e6eed95cd4..c27efd382b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -322,8 +322,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) const double x = (double)xx; if( debug ) { - std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; - std::cout << std::setprecision(40) << " x= " << x << std::endl; + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; } //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; //int width = 46; @@ -340,8 +340,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision(6); // default }; - //testSinCosTanX( M_PIl, 1E-3, true ); // from math.h - //testSinCosTanX( (long 
double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) @@ -377,25 +377,36 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) { - auto toleranceForX = []( const double ) + auto toleranceForX = [distance4]( const double x ) { - return 2E-3; // NB: tolerance 1E-3 is not enough for cos( x=-7.8539816339744828 ) with 100 steps in [-4*pi,6*pi] + const double d4 = distance4( x ); + if ( d4 < 1E-14 ) return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if ( d4 < 1E-13 ) return 1E-04; + else if ( d4 < 1E-12 ) return 1E-05; + else if ( d4 < 1E-11 ) return 1E-06; + else if ( d4 < 1E-10 ) return 1E-07; + else if ( d4 < 1E-09 ) return 1E-08; + else if ( d4 < 1E-08 ) return 1E-09; + else if ( d4 < 1E-07 ) return 1E-10; + else if ( d4 < 1E-06 ) return 1E-11; + else if ( d4 < 1E-05 ) return 1E-12; + else if ( d4 < 1E-04 ) return 1E-13; + else return 1E-14; // play it safe even if the agreement might even be better? 
}; for ( int istep = 0; istep < nstep + 1; istep++ ) { double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) const double tolerance = toleranceForX( x ); - EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); - EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); - EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); - const bool debug = false; - testSinCosTanX( x, tolerance, debug, istep ); // strangely, this succeeds instead! 
} }; - testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); /* // Test constexpr atan From 5147422f591225dccbed69929bc79c896938fb25 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:18:38 +0100 Subject: [PATCH 39/96] [susy2] in susyggtt.sa constexpr_math.h, fix atan after more intense testing --- epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h index 14677f6bda..d48795b8b0 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -202,15 +202,15 @@ namespace mg5amcCpu constexpr long double constexpr_atan_quad( const long double xx ) { if( xx > 1 ) - return atanTaylor( 1 / xx ); + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); else if( xx == 1 ) return constexpr_pi_by_4; else if( xx > -1 ) return atanTaylor( xx ); else if( xx == -1 ) return -constexpr_pi_by_4; - else if( xx < -1 ) - return atanTaylor( 1 / xx ); + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); } // Constexpr implementation of atan (double signature, internally implemented as long double) From 6c55ff8d3920e1bdcd110ce04745a566204fd8e4 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:18:51 +0100 Subject: [PATCH 40/96] [susy2] in susyggtt.sa, complete testmisc.cc with tests for atan --- .../cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index c27efd382b..867763cd7c 100644 --- 
a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -408,21 +408,18 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); - /* // Test constexpr atan { - const int nstep = 40; + const double tolerance = 1E-12; + const int nstep = 1000; for ( int istep = 0; istep < nstep + 1; istep++ ) { - long double x0 = -2, x1 = +2; + long double x0 = -5, x1 = +5; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) - std::cout << std::endl; - std::cout << "x = " << x << std::endl; - std::cout << "atan = " << std::tan( x ) << std::endl; - std::cout << "constexpr_atan = " << constexpr_tan( x ) << std::endl; + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; } } - */ //-------------------------------------------------------------------------- From e1e62eb12556c0cac6bb63d8a9817b36bca99fc8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:28:34 +0100 Subject: [PATCH 41/96] [susy2] in CODEGEN, add constexpr_math.h from susyggtt.sa, with the implementation of constexpr trig functions for HRDCOD=1 cuda builds (#627) Also add it to output.py in CODEGEN --- .../template_files/gpu/constexpr_math.h | 223 ++++++++++++++++++ .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 1 + 2 files changed, 224 insertions(+) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h new file mode 100644 index 
0000000000..d48795b8b0 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx <= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi 
) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi 
) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx < 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 85e49ffba9..2526dd73c5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -91,6 +91,7 @@ class 
PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', + s+'gpu/constexpr_math.h', s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', From 8cdd587cb47231d46ef6b9a495adadeb59a85a63 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:32:40 +0100 Subject: [PATCH 42/96] [susy2] in CODEGEN, fix clang formatting in constexpr_math.h --- .../iolibs/template_files/gpu/constexpr_math.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h index d48795b8b0..78ff8b16ab 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h @@ -25,7 +25,7 @@ namespace mg5amcGpu #else namespace mg5amcCpu #endif -{ +{ // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) { @@ -58,7 +58,7 @@ namespace mg5amcCpu } // PI from cmath - constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi = M_PIl; // pi constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 @@ -76,9 +76,9 @@ namespace mg5amcCpu long double sinxlast = sinx; sinx += delta; #ifdef CONSTEXPR_MATH_DEBUG - std::cout << "ipow=" << ipow << ", delta=" << delta << 
", sinx=" << sinx < +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,128 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + 
EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision(6); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... 
+ testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if ( d4 < 1E-14 ) return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if ( d4 < 1E-13 ) return 1E-04; + else if ( d4 < 1E-12 ) return 1E-05; + else if ( d4 < 1E-11 ) return 1E-06; + else if ( d4 < 1E-10 ) return 1E-07; + else if ( d4 < 1E-09 ) return 1E-08; + else if ( d4 < 1E-08 ) return 1E-09; + else if ( d4 < 1E-07 ) return 1E-10; + else if ( d4 < 1E-06 ) return 1E-11; + else if ( d4 < 1E-05 ) return 1E-12; + else if ( d4 < 1E-04 ) return 1E-13; + else return 1E-14; // play it safe even if the agreement might even be better? 
+ }; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for ( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- + } From edc50059259e22bc8db7bd54fc70c4d46a372c5e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:36:30 +0100 Subject: [PATCH 44/96] [susy2] in CODEGEN cudacpp.mk, add optional quadmath dependency for constexpr_math.h tests from susyggtt.sa (#627) --- 
.../madgraph/iolibs/template_files/gpu/cudacpp.mk | 3 +++ 1 file changed, 3 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index efe82df88d..c90bc8dd88 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs From 26295a6b903c2c44ba8d64fb00d7d5a0a0f5f096 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:38:44 +0100 Subject: [PATCH 45/96] [susy2] in CODEGEN, fix clang formatting in testmisc.cc --- .../iolibs/template_files/gpu/testmisc.cc | 73 +++++++++++-------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index f4c4869b50..8c29482e5a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -309,13 +309,13 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) // Distance from the horizontal or vertical axis (i.e. 
from 0, pi/2, pi, or 3pi/2) auto distance4 = []( const long double xx ) { - const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) - const long double d0 = xx3; // distance from 0 - const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 return ( d0 < d1 ? d0 : d1 ); }; - + // Test constexpr sin, cos, tan - specific, problematic, points auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) { @@ -338,14 +338,14 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; - std::cout << std::setprecision(6); // default + std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h - testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) - testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) - testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) - + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) { @@ -353,19 +353,19 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { const bool debug = false; const int nstep = 8; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { long double x0 = deltax * ioff; long double x1 = deltax * ioff + 2 * constexpr_pi; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) testSinCosTanX( x, tolerance, debug, istep ); } - } + } }; // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] - testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 @@ -380,29 +380,41 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) auto toleranceForX = [distance4]( const double x ) { const double d4 = distance4( x ); - if ( d4 < 1E-14 ) return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
- else if ( d4 < 1E-13 ) return 1E-04; - else if ( d4 < 1E-12 ) return 1E-05; - else if ( d4 < 1E-11 ) return 1E-06; - else if ( d4 < 1E-10 ) return 1E-07; - else if ( d4 < 1E-09 ) return 1E-08; - else if ( d4 < 1E-08 ) return 1E-09; - else if ( d4 < 1E-07 ) return 1E-10; - else if ( d4 < 1E-06 ) return 1E-11; - else if ( d4 < 1E-05 ) return 1E-12; - else if ( d4 < 1E-04 ) return 1E-13; - else return 1E-14; // play it safe even if the agreement might even be better? + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? 
}; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) const double tolerance = toleranceForX( x ); EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); } }; testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) @@ -412,7 +424,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { const double tolerance = 1E-12; const int nstep = 1000; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { long double x0 = -5, x1 = +5; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) @@ -422,5 +434,4 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID 
), testmisc ) } //-------------------------------------------------------------------------- - } From 8671c27847a5d42d9b6b093bd82206297cb9110c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:39:30 +0100 Subject: [PATCH 46/96] [susy2] in susy_gg_tt.sa, fix clang formatting and minor details as in the CODEGEN backport --- .../susy_gg_tt.sa/SubProcesses/cudacpp.mk | 2 +- .../susy_gg_tt.sa/SubProcesses/testmisc.cc | 75 +++++++++++-------- .../susy_gg_tt.sa/src/constexpr_math.h | 16 ++-- 3 files changed, 52 insertions(+), 41 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk index 15e90b0d75..f7a61d3e74 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/cudacpp.mk @@ -848,7 +848,7 @@ endif endif # Test quadmath in testmisc.cc tests for constexpr_math #627 -#$(testmain): LIBFLAGS += -lquadmath +###$(testmain): LIBFLAGS += -lquadmath # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index 867763cd7c..8c29482e5a 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -15,7 +15,7 @@ #include -#include +//#include //#include // needs C++20... https://stackoverflow.com/a/65347016 #include #include @@ -309,13 +309,13 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) // Distance from the horizontal or vertical axis (i.e. 
from 0, pi/2, pi, or 3pi/2) auto distance4 = []( const long double xx ) { - const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) - const long double d0 = xx3; // distance from 0 - const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 return ( d0 < d1 ? d0 : d1 ); }; - + // Test constexpr sin, cos, tan - specific, problematic, points auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) { @@ -338,14 +338,14 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; - std::cout << std::setprecision(6); // default + std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h - testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) - testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) - testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) - + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) { @@ -353,19 +353,19 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { const bool debug = false; const int nstep = 8; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { long double x0 = deltax * ioff; long double x1 = deltax * ioff + 2 * constexpr_pi; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) testSinCosTanX( x, tolerance, debug, istep ); } - } + } }; // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] - testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 @@ -380,29 +380,41 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) auto toleranceForX = [distance4]( const double x ) { const double d4 = distance4( x ); - if ( d4 < 1E-14 ) return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
- else if ( d4 < 1E-13 ) return 1E-04; - else if ( d4 < 1E-12 ) return 1E-05; - else if ( d4 < 1E-11 ) return 1E-06; - else if ( d4 < 1E-10 ) return 1E-07; - else if ( d4 < 1E-09 ) return 1E-08; - else if ( d4 < 1E-08 ) return 1E-09; - else if ( d4 < 1E-07 ) return 1E-10; - else if ( d4 < 1E-06 ) return 1E-11; - else if ( d4 < 1E-05 ) return 1E-12; - else if ( d4 < 1E-04 ) return 1E-13; - else return 1E-14; // play it safe even if the agreement might even be better? + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? 
}; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) const double tolerance = toleranceForX( x ); EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) - << std::setprecision(40) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); } }; testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) @@ -412,7 +424,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { const double tolerance = 1E-12; const int nstep = 1000; - for ( int istep = 0; istep < nstep + 1; istep++ ) + for( int istep = 0; istep < nstep + 1; istep++ ) { long double x0 = -5, x1 = +5; double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) @@ -422,5 +434,4 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID 
), testmisc ) } //-------------------------------------------------------------------------- - } diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h index d48795b8b0..78ff8b16ab 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -25,7 +25,7 @@ namespace mg5amcGpu #else namespace mg5amcCpu #endif -{ +{ // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) { @@ -58,7 +58,7 @@ namespace mg5amcCpu } // PI from cmath - constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi = M_PIl; // pi constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 @@ -76,9 +76,9 @@ namespace mg5amcCpu long double sinxlast = sinx; sinx += delta; #ifdef CONSTEXPR_MATH_DEBUG - std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx <( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 
1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; From 93cf463339fd087b00f446053d75c1b2a4cb740c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 12:46:53 +0100 Subject: [PATCH 48/96] [susy2] in CODEGEN model.py, ensure that constexpr math functions are used #627 Also add a comment about std::complex (replace is done twice) The differences in generated code are now the following: ______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc /tmp/git-blob-dFUXLJ/CPPProcess.cc 0651b576632e9d50c3ed2bf9461bd41309c90821 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc 0000000000000000000000000000000000000000 100644 79c79 < __device__ const fptype cIPD[3] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT, (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; --- > __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; 83c83 < __device__ __constant__ fptype cIPD[3]; --- > __device__ __constant__ fptype cIPD[2]; 86c86 < static fptype cIPD[3]; --- > static fptype cIPD[2]; 505c505 < const fptype tIPD[3] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT, (fptype)m_pars->mdl_I51x11 }; --- > const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; 508c508 < gpuMemcpyToSymbol( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); 511c511 < memcpy( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); 514c514 < //for ( i=0; i<3; i++ ) std::cout << 
std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; --- > //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; ______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h /tmp/git-blob-fVhC4J/Parameters_MSSM_SLHA2.h 2805ef7d81bd8340b2a77b2ec16df7639165e30c 100644 epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h 0000000000000000000000000000000000000000 100644 23,24d22 < #include "constexpr_math.h" < 295c293 < __device__ constexpr double mdl_I51x11 = 1.; --- > constexpr double mdl_I51x11 = 1.; 370c368 < constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * constexpr_cos( 2. * mdl_beta ) ) * constexpr_tan( 2. * mdl_beta ) ) / 2.; --- > constexpr cxsmpl mdl_bb = ( ( -mdl_mHd2 + mdl_mHu2 - mdl_MZ__exp__2 * constexpr_cos( 2. * mdl_beta ) ) * tan( 2. 
* mdl_beta ) ) / 2.; 801,802c799 < //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds < const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds --- > const fptype mdl_I51x11; --- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 74d9c35bd8..96ecb83583 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -721,9 +721,13 @@ def write_hardcoded_parameters(self, params): res = '// (none)\n' return res pardef = pardef.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) - parset = parset.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) + parset = parset.replace('std::complex<','cxsmpl<') # NB this is really needed twice! 
(if there are two std::complex on the same line) parset = parset.replace('sqrt(','constexpr_sqrt(') # constexpr sqrt (based on iterative Newton-Raphson approximation) - parset = parset.replace('pow(','constexpr_pow(') # constexpr sqrt (based on iterative Newton-Raphson approximation) + parset = parset.replace('pow(','constexpr_pow(') # constexpr pow + parset = parset.replace('atan(','constexpr_atan(') # constexpr atan for BSM #627 + parset = parset.replace('sin(','constexpr_sin(') # constexpr sin for BSM #627 + parset = parset.replace('cos(','constexpr_cos(') # constexpr cos for BSM #627 + parset = parset.replace(' tan(',' constexpr_tan(') # constexpr tan for BSM #627 parset = parset.replace('(','( ') parset = parset.replace(')',' )') parset = parset.replace('+',' + ') From ccd719c6105bc1872db05082c3dec5fd1ad30516 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 13:05:20 +0100 Subject: [PATCH 49/96] [susy2] progress in CODEGEN: mark mdl_I51x11 as device parameter (this requires moving earlier on the mechanism to identify parameters such as this one) --- .../CUDACPP_SA_OUTPUT/model_handling.py | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 96ecb83583..d995004a57 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -705,7 +705,7 @@ def write_set_parameters(self, params): return res # AV - new method (merging write_parameters and write_set_parameters) - def write_hardcoded_parameters(self, params): + def write_hardcoded_parameters(self, params, deviceparams=set()): majorana_widths = [] for particle in self.model.get('particles'): if particle.is_fermion() and particle.get('self_antipart') and \ @@ -747,6 +747,8 @@ def write_hardcoded_parameters(self, params): 
###print(len(pardef_lines), par) # for debugging if par in majorana_widths: pardef_lines[par] = ( 'constexpr ' + type + ' ' + par + "_abs" ) + elif par in deviceparams: + pardef_lines[par] = ( '__device__ constexpr ' + type + ' ' + par ) else: pardef_lines[par] = ( 'constexpr ' + type + ' ' + par ) ###misc.sprint( 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) ) @@ -814,6 +816,19 @@ def super_write_set_parameters_onlyfixMajorana(self, hardcoded): # FIXME! split # AV - replace export_cpp.UFOModelConverterCPP method (add hardcoded parameters and couplings) def super_generate_parameters_class_files(self): """Create the content of the Parameters_model.h and .cc files""" + # First of all, identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM models + # because they are used in the event by event calculation of alphaS-dependent couplings + # WARNING! This is only implemented and has only been tested so far for real parameters (complex parameters need twice the storage) + if self.model_name[:2] != 'sm' : + param_indep_real_used = [] + for param in self.params_indep: + if param.type == 'real': + for coup in self.coups_dep.values(): + if param.name in coup.expr: + param_indep_real_used.append( param.name ) + param_indep_real_used = set( param_indep_real_used ) + misc.sprint('PIPPO!', param_indep_real_used ) + # Then do everything else replace_dict = self.default_replace_dict replace_dict['info_lines'] = PLUGIN_export_cpp.get_mg5_info_lines() replace_dict['model_name'] = self.model_name @@ -845,7 +860,7 @@ def super_generate_parameters_class_files(self): assert super().write_parameters([]) == '', 'super().write_parameters([]) is not empty' # AV sanity check (#622) assert self.super_write_set_parameters_donotfixMajorana([]) == '', 'super_write_set_parameters_donotfixMajorana([]) is not empty' # AV sanity check (#622) ###misc.sprint(self.params_indep) # for debugging - 
hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep).split('\n') ] + hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep,param_indep_real_used).split('\n') ] # use param_indep_real_used as deviceparams replace_dict['hardcoded_independent_parameters'] = '\n'.join( hrd_params_indep ) + self.super_write_set_parameters_onlyfixMajorana( hardcoded=True ) # add fixes for Majorana particles only in the aS-indep parameters #622 ###misc.sprint(self.coups_indep) # for debugging replace_dict['hardcoded_independent_couplings'] = self.write_hardcoded_parameters(self.coups_indep) @@ -904,18 +919,6 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar2'] = ' // (none)' replace_dict['dcoupsetdcoup2'] = ' // (none)' replace_dict['dcoupoutdcoup2'] = '' - # Identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM/EFT/SUSY models - # because they are used in the event by event calculation of alphaS-dependent couplings - # WARNING! 
This is only implemented and has only been tested so far for real parameters (complex parameters need twice the storage) - if self.model_name[:2] != 'sm' : - param_indep_real_used = [] - for param in self.params_indep: - if param.type == 'real': - for coup in self.coups_dep.values(): - if param.name in coup.expr: - param_indep_real_used.append( param.name ) - param_indep_real_used = set( param_indep_real_used ) - misc.sprint('PIPPO!', param_indep_real_used ) # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD if self.model_name[:2] == 'sm' : replace_dict['eftwarn0'] = '' From 2ffa7e51d57396f73a27990649236fb72361fe2a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 13:10:53 +0100 Subject: [PATCH 50/96] [susy2] in CODEGEN, add constexpr_math.h to the Parameters.h template --- .../madgraph/iolibs/template_files/cpp_model_parameters_h.inc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index c5a74d6071..97afb8b17f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -18,6 +18,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== #ifndef MGONGPU_HARDCODE_PARAM%(eftwarn0)s From 59c80b611e541e6189fe8a030ba556810a829b0b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 13:16:23 +0100 Subject: [PATCH 51/96] [susy2] in CODEGEN model.py, fix the 'tan(' replacement with constexpr The differences are now reduced to this ______________________________________________________________________________ git diff 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc /tmp/git-blob-YTajRy/CPPProcess.cc 0651b576632e9d50c3ed2bf9461bd41309c90821 100644 epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc 0000000000000000000000000000000000000000 100644 79c79 < __device__ const fptype cIPD[3] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT, (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; --- > __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; 83c83 < __device__ __constant__ fptype cIPD[3]; --- > __device__ __constant__ fptype cIPD[2]; 86c86 < static fptype cIPD[3]; --- > static fptype cIPD[2]; 505c505 < const fptype tIPD[3] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT, (fptype)m_pars->mdl_I51x11 }; --- > const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; 508c508 < gpuMemcpyToSymbol( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); 511c511 < memcpy( cIPD, tIPD, 3 * sizeof( fptype ) ); --- > memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); 514c514 < //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; --- > //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; ______________________________________________________________________________ git diff /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h /tmp/git-blob-yCgmp8/Parameters_MSSM_SLHA2.h 2805ef7d81bd8340b2a77b2ec16df7639165e30c 100644 
epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h 0000000000000000000000000000000000000000 100644 801,802c801 < //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds < const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds --- > const fptype mdl_I51x11; --- .../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index d995004a57..085b0d58fe 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -727,7 +727,7 @@ def write_hardcoded_parameters(self, params, deviceparams=set()): parset = parset.replace('atan(','constexpr_atan(') # constexpr atan for BSM #627 parset = parset.replace('sin(','constexpr_sin(') # constexpr sin for BSM #627 parset = parset.replace('cos(','constexpr_cos(') # constexpr cos for BSM #627 - parset = parset.replace(' tan(',' constexpr_tan(') # constexpr tan for BSM #627 + parset = parset.replace('tan(','constexpr_tan(').replace('aconstexpr_tan(','atan(') # constexpr tan for BSM #627 parset = parset.replace('(','( ') parset = parset.replace(')',' )') parset = parset.replace('+',' + ') From 5fc28981e05a5010227f325d742e43a1bf005a3a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 15:33:05 +0100 Subject: [PATCH 52/96] [susy2] in susy_gg_tt.sa, restructure the handling of BSM parameters to try and make this easier to code generate (NB: buils and tests are ok for both HARDCOD=0 and =1) --- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 27 ++++++++++--------- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 26 +++++++++++------- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 9 +++---- 3 files changed, 35 insertions(+), 27 deletions(-) diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 1d7052d0ab..2208a5ec31 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.117 s +1 processes with 3 diagrams generated in 0.116 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -562,33 +562,34 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 160]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 165]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 194]  -DEBUG: type(subproc_group)= [output.py at line 195]  -DEBUG: type(fortran_model)= [output.py at line 196]  -DEBUG: type(me)= me=0 [output.py at line 197]  -DEBUG: "need to link", self.to_link_in_P =  
need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 198]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 
'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 203]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.139 s +ALOHA: aloha creates 2 routines in 0.137 s VVV1 FFV1 FFV1 FFV1 FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. +DEBUG: 'PIPPO!', param_indep_real_used =  PIPPO! 
{'mdl_I51x11'} [model_handling.py at line 830]  super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h @@ -597,7 +598,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.273s -user 0m1.199s -sys 0m0.065s +real 0m1.484s +user 0m1.197s +sys 0m0.068s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 0651b57663..9786b5b555 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -76,15 +77,18 @@ namespace mg5amcCpu // However, physics parameters are user-defined through card files: use CUDA constant memory instead (issue #39) // [NB if hardcoded parameters are used, it's better to define them here to avoid silent shadowing (issue #263)] #ifdef MGONGPU_HARDCODE_PARAM - __device__ const fptype cIPD[3] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT, (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; + __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 + __device__ const fptype bsmIndepParam[1] = { (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; #else #ifdef MGONGPUCPP_GPUIMPL - __device__ __constant__ fptype cIPD[3]; + __device__ 
__constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 + __device__ __constant__ fptype bsmIndepParam[1]; #else - static fptype cIPD[3]; + static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 + static fptype bsmIndepParam[1]; #endif #endif @@ -502,16 +506,20 @@ namespace mg5amcCpu m_masses.push_back( m_pars->mdl_MT ); // Read physics parameters like masses and couplings from user configuration files (static: initialize once) // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory - const fptype tIPD[3] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT, (fptype)m_pars->mdl_I51x11 }; + const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 + const fptype bsmIndepParamTMP[1] = { (fptype)m_pars->mdl_I51x11 }; #ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 3 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 + gpuMemcpyToSymbol( bsmIndepParam, bsmIndepParamTMP, 1 * sizeof( fptype ) ); #else - memcpy( cIPD, tIPD, 3 * sizeof( fptype ) ); + memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 + memcpy( bsmIndepParam, bsmIndepParamTMP, 1 * sizeof( fptype ) ); #endif - //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + for ( int i=0; i<1; i++ ) std::cout << std::setprecision(17) << "bsmIndepParamTMP[i] = " << bsmIndepParamTMP[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -622,7 +630,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, cIPD ); + G2COUP( 
allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -632,7 +640,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, cIPD ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 2805ef7d81..aec9b9ecbe 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -793,13 +793,12 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* cIPD ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_MSSM_SLHA2; #else - //const double mdl_I51x11 = Parameters_MSSM_SLHA2::getInstance()->mdl_I51x11; // fix HRDCOD=0 susy builds - const fptype mdl_I51x11 = cIPD[2]; // fix HRDCOD=0 susy builds + const fptype mdl_I51x11 = bsmIndepParamPtr[0]; #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -871,12 +870,12 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* cIPD ) + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_MSSM_SLHA2_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs 
); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, cIPD ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_6s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_6 ); fptype* GC_51s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_51 ); cxtype_sv_ref GC_6s_sv = C_ACCESS::kernelAccess( GC_6s ); From 8c1f38c95ca47b7934c3eb184fc027d4d8df0ab3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 17:05:53 +0100 Subject: [PATCH 53/96] [susy2] in susy_gg_tt.sa, failed attempt at further restructuring the handling of BSM parameters to try and make this easier to code generate (NB: buils and tests are ok > For HRDCOD=1 this builds and runs ok For HRDCOD=0 this builds but it fails at runtime with invalid device memory access --- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 16 +++++++++------- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc | 2 ++ .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 6 ++++++ 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 9786b5b555..ed56af96ff 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -79,16 +79,17 @@ namespace mg5amcCpu #ifdef MGONGPU_HARDCODE_PARAM __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 - __device__ const fptype bsmIndepParam[1] = { (fptype)Parameters_MSSM_SLHA2::mdl_I51x11 }; + //__device__ const double* bsmIndepParam[1] = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; #else #ifdef 
MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 - __device__ __constant__ fptype bsmIndepParam[1]; + __device__ __constant__ double bsmIndepParam[1]; #else static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 - static fptype bsmIndepParam[1]; + static double bsmIndepParam[1]; #endif #endif @@ -508,18 +509,19 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 - const fptype bsmIndepParamTMP[1] = { (fptype)m_pars->mdl_I51x11 }; #ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - gpuMemcpyToSymbol( bsmIndepParam, bsmIndepParamTMP, 1 * sizeof( fptype ) ); + for( int iibsmp=0; iibsmp<1; iibsmp++ ) + gpuMemcpyToSymbol( &(bsmIndepParam[iibsmp]), &(m_pars->mdl_bsmIndepParamPtr[iibsmp]), sizeof( double ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - memcpy( bsmIndepParam, bsmIndepParamTMP, 1 * sizeof( fptype ) ); + for( int iibsmp=0; iibsmp<1; iibsmp++ ) + memcpy( &(bsmIndepParam[iibsmp]), &(m_pars->mdl_bsmIndepParamPtr[iibsmp]), sizeof( double ) ); #endif //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; - for ( int i=0; i<1; i++ ) std::cout << std::setprecision(17) << "bsmIndepParamTMP[i] = " << bsmIndepParamTMP[i] << std::endl; + for ( int i=0; i<1; i++ ) std::cout << std::setprecision(17) << "m_pars->mdl_bsmIndepParamPtr[i] = " << m_pars->mdl_bsmIndepParamPtr[i] << std::endl; } #else // Initialize process (with hardcoded parameters) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc 
b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index fe3cec4f0f..5f2782d269 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -815,6 +815,8 @@ Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) mdl_Wneu4 = -abs( mdl_Wneu4 ); if( mdl_Mgo < 0 ) mdl_Wgo = -abs( mdl_Wgo ); + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + //mdl_bsmIndepParam = { mdl_I51x11 }; } void diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index aec9b9ecbe..0733efd1a9 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -84,6 +84,9 @@ namespace mg5amcCpu // Print couplings that are changed event by event //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + const double* mdl_bsmIndepParamPtr[1] = { &mdl_I51x11 }; + private: static Parameters_MSSM_SLHA2* instance; @@ -760,6 +763,9 @@ namespace mg5amcCpu // Print couplings that are changed event by event //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + __device__ constexpr double mdl_bsmIndepParam[1] = { mdl_I51x11 }; } } // end namespace mg5amcGpu/mg5amcCpu From 322b42346eed6720215d04559f351de3946bc498 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 17:15:04 +0100 Subject: [PATCH 54/96] [susy2] in susy_gg_tt.sa, final(?) 
restructuring of the handling of BSM parameters to try and make this easier to code generate (NB: buils and tests are ok > --- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 13 +++++-------- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc | 2 +- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 6 ++++-- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index ed56af96ff..bceccd38c7 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -79,17 +79,16 @@ namespace mg5amcCpu #ifdef MGONGPU_HARDCODE_PARAM __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 - //__device__ const double* bsmIndepParam[1] = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; #else #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 - __device__ __constant__ double bsmIndepParam[1]; + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #else static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 - static double bsmIndepParam[1]; + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #endif #endif @@ -512,16 +511,14 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - for( int iibsmp=0; iibsmp<1; iibsmp++ ) - gpuMemcpyToSymbol( &(bsmIndepParam[iibsmp]), &(m_pars->mdl_bsmIndepParamPtr[iibsmp]), sizeof( double ) ); + 
gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - for( int iibsmp=0; iibsmp<1; iibsmp++ ) - memcpy( &(bsmIndepParam[iibsmp]), &(m_pars->mdl_bsmIndepParamPtr[iibsmp]), sizeof( double ) ); + memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #endif //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; - for ( int i=0; i<1; i++ ) std::cout << std::setprecision(17) << "m_pars->mdl_bsmIndepParamPtr[i] = " << m_pars->mdl_bsmIndepParamPtr[i] << std::endl; + for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl; } #else // Initialize process (with hardcoded parameters) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc index 5f2782d269..5438997b4f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.cc @@ -816,7 +816,7 @@ Parameters_MSSM_SLHA2::setIndependentParameters( SLHAReader& slha ) if( mdl_Mgo < 0 ) mdl_Wgo = -abs( mdl_Wgo ); // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; - //mdl_bsmIndepParam = { mdl_I51x11 }; + mdl_bsmIndepParam[0] = mdl_I51x11; } void diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 0733efd1a9..20cb301363 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -85,7 +85,8 @@ namespace mg5amcCpu //void printDependentCouplings(); // now computed event-by-event (running alphas #373) // BSM parameters that do not depend on alphaS but are needed in the 
computation of alphaS-dependent couplings; - const double* mdl_bsmIndepParamPtr[1] = { &mdl_I51x11 }; + static constexpr int nBsmIndepParam = 1; + double mdl_bsmIndepParam[ nBsmIndepParam ]; private: @@ -765,7 +766,8 @@ namespace mg5amcCpu //void printDependentCouplings(); // now computed event-by-event (running alphas #373) // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; - __device__ constexpr double mdl_bsmIndepParam[1] = { mdl_I51x11 }; + constexpr int nBsmIndepParam = 1; + __device__ constexpr double mdl_bsmIndepParam[ nBsmIndepParam ] = { mdl_I51x11 }; } } // end namespace mg5amcGpu/mg5amcCpu From 8521dd83e9a2ed450820f6a6f75713a8e61c6468 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 17:55:09 +0100 Subject: [PATCH 55/96] [susy2] in susy_gg_tt.sa, further change to ease code generation: add "#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1" if nBsm>0 This ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated There may be a better way, but this will do for the moment... 
--- .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 15 +++++++++++++-- .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 4 ++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index bceccd38c7..0a10c278ef 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -79,17 +79,28 @@ namespace mg5amcCpu #ifdef MGONGPU_HARDCODE_PARAM __device__ const fptype cIPD[2] = { (fptype)Parameters_MSSM_SLHA2::mdl_MT, (fptype)Parameters_MSSM_SLHA2::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 - __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; #else #ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 - __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #else static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 +#endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git 
a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 20cb301363..b9de715f48 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -24,6 +24,10 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1 + #ifndef MGONGPU_HARDCODE_PARAM //#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625) From a4cac2b305b6a9d5beef9a85a40237c8b9f9e025 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 18:07:57 +0100 Subject: [PATCH 56/96] [susy2] in susy_gg_tt.sa, further changes to ease code generation: add another check if nBsm>0 --- .../SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index 0a10c278ef..bf63d1a80e 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -522,14 +522,16 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); + if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 ) + gpuMemcpyToSymbol( 
bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 - memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); + if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 ) + memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #endif //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; - for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl; + //for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl; } #else // Initialize process (with hardcoded parameters) From fbeee37eab16e669498555ef106f7a3249d79b43 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 18:51:50 +0100 Subject: [PATCH 57/96] [susy2] in CODEGEN, finally complete the backport from susy_gg_tt.sa! 
--- .../template_files/cpp_model_parameters_h.inc | 12 ++--- .../iolibs/template_files/gpu/process_cc.inc | 1 + .../gpu/process_function_definitions.inc | 12 ++--- .../CUDACPP_SA_OUTPUT/model_handling.py | 53 +++++++++++++++++-- 4 files changed, 62 insertions(+), 16 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 97afb8b17f..972cebc051 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -20,7 +20,7 @@ #include "constexpr_math.h" -//========================================================================== +//==========================================================================%(bsmdefine)s #ifndef MGONGPU_HARDCODE_PARAM%(eftwarn0)s @@ -72,7 +72,7 @@ namespace mg5amcCpu //void printDependentParameters(); // now computed event-by-event (running alphas #373) // Print couplings that are changed event by event - //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + //void printDependentCouplings(); // now computed event-by-event (running alphas #373)%(bsmip0)s private: @@ -116,7 +116,7 @@ namespace mg5amcCpu //void printDependentParameters(); // now computed event-by-event (running alphas #373) // Print couplings that are changed event by event - //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + //void printDependentCouplings(); // now computed event-by-event (running alphas #373)%(bsmip1)s } } // end namespace mg5amcGpu/mg5amcCpu @@ -148,7 +148,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* cIPD ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_%(model_name)s; @@ -195,12 +195,12 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* cIPD ) + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_%(model_name)s_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, cIPD ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s mgDebug( 1, __FUNCTION__ ); return; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 815fd8d5b7..1c0f433860 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 3848d35d40..320d33cc45 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -51,7 +51,7 @@ namespace mg5amcCpu %(cipdstatic)s %(cipcstatic)s #endif -#endif +#endif%(bsmindepparam)s // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL @@ -119,11 +119,11 @@ namespace mg5amcCpu %(cipcassign)s #ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s - %(cipc2tipcSym)s + %(cipc2tipcSym)s%(bsmMemcpySym)s #else %(cipd2tipd)s - %(cipc2tipc)s -#endif%(cipddump)s%(cipcdump)s + %(cipc2tipc)s%(bsmMemcpy)s +#endif%(cipddump)s%(cipcdump)s%(bsmdump)s } #else // Initialize process (with hardcoded parameters) @@ -230,7 +230,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings, cIPD ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -240,7 +240,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings, cIPD ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 085b0d58fe..1c5c40adfd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -827,7 +827,6 @@ def super_generate_parameters_class_files(self): if param.name in coup.expr: param_indep_real_used.append( param.name ) param_indep_real_used = set( param_indep_real_used ) - misc.sprint('PIPPO!', param_indep_real_used ) # Then do everything else replace_dict = self.default_replace_dict replace_dict['info_lines'] = PLUGIN_export_cpp.get_mg5_info_lines() @@ 
-845,6 +844,13 @@ def super_generate_parameters_class_files(self): line for line in self.write_set_parameters(self.params_indep).split('\n') ] replace_dict['set_independent_parameters'] = '\n'.join( set_params_indep ) replace_dict['set_independent_parameters'] += self.super_write_set_parameters_onlyfixMajorana( hardcoded=False ) # add fixes for Majorana particles only in the aS-indep parameters #622 + if self.model_name[:2] != 'sm' : + replace_dict['set_independent_parameters'] += '\n // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings;' + if len(param_indep_real_used) > 0: + for par in param_indep_real_used: + replace_dict['set_independent_parameters'] += '\n mdl_bsmIndepParam[0] = %s;'%par + else: + replace_dict['set_independent_parameters'] += '\n // (none)' replace_dict['set_independent_couplings'] = self.write_set_parameters(self.coups_indep) replace_dict['set_dependent_parameters'] = self.write_set_parameters(self.params_dep) replace_dict['set_dependent_couplings'] = self.write_set_parameters(list(self.coups_dep.values())) @@ -921,18 +927,34 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupoutdcoup2'] = '' # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD if self.model_name[:2] == 'sm' : + replace_dict['bsmdefine'] = '' + replace_dict['bsmip0'] = '' + replace_dict['bsmip1'] = '' replace_dict['eftwarn0'] = '' replace_dict['eftwarn1'] = '' replace_dict['eftspecial0'] = ' // SM implementation - no special handling of non-hardcoded parameters (PR #625)' replace_dict['eftspecial1'] = ' // Begin SM implementation - no special handling of vectors of floats as in EFT (#439)' replace_dict['eftspecial2'] = ' // End SM implementation - no special handling of vectors of floats as in EFT (#439)' else: + replace_dict['bsmdefine'] = ''' + +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm 
is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +%s''' % ( '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if len( param_indep_real_used ) > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' ) + replace_dict['bsmip0'] = ''' + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + static constexpr int nBsmIndepParam = %s; + %sdouble mdl_bsmIndepParam[nBsmIndepParam];''' % ( len( param_indep_real_used ), '' if len( param_indep_real_used ) > 0 else '//' ) + replace_dict['bsmip1'] = '''\n + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + constexpr int nBsmIndepParam = %s; + %s__device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam]%s;''' % ( len( param_indep_real_used ), '' if len( param_indep_real_used ) > 0 else '//', ' = { %s }' % ', '.join( param_indep_real_used ) if len( param_indep_real_used ) > 0 else '' ) replace_dict['eftwarn0'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625)' replace_dict['eftwarn1'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625)' if len( param_indep_real_used ) == 0: replace_dict['eftspecial0'] = ' // No additional parameters needed in constant memory for this BSM model' else: - replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s;' % param for param in param_indep_real_used ) + replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s = bsmIndepParamPtr[%i];' % ( par, ipar ) for ipar, par in enumerate( param_indep_real_used ) ) replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else @@ -1150,7 +1172,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) - replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) + replace_dict['cipcdump'] = '\n //for ( int i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) for coup in coupling_indep : coup_str_hrd += '(fptype)Parameters_%s::%s.real(), (fptype)Parameters_%s::%s.imag(), ' % ( self.model_name, coup, self.model_name, coup ) # AV only indep! 
coup_str_hrd = coup_str_hrd[:-2] + ' };' @@ -1170,7 +1192,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) - replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) + replace_dict['cipddump'] = '\n //for ( int i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) for para in params : param_str_hrd += '(fptype)Parameters_%s::%s, ' % ( self.model_name, para ) param_str_hrd = param_str_hrd[:-2] + ' };' @@ -1183,6 +1205,29 @@ def get_process_function_definitions(self, write=True): replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' + if self.model_name[:2] == 'sm' : + replace_dict['bsmindepparam'] = '' + replace_dict['bsmMemcpySym'] = '' + replace_dict['bsmMemcpy'] = '' + replace_dict['bsmdump'] = '' + else: + replace_dict['bsmindepparam'] = '''\n + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double 
bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#endif''' + replace_dict['bsmMemcpySym'] = '\n if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 )\n gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) );' + replace_dict['bsmMemcpy'] = '\n if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 )\n memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) );' + replace_dict['bsmdump'] = '\n //for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl;' replace_dict['all_helicities'] = self.get_helicity_matrix(self.matrix_elements[0]) replace_dict['all_helicities'] = replace_dict['all_helicities'] .replace('helicities', 'tHel') color_amplitudes = [me.get_color_amplitudes() for me in self.matrix_elements] # as in OneProcessExporterCPP.get_process_function_definitions From fdd44a8f2e866ad8b5c5b700647668ee0d43198f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 18:52:32 +0100 Subject: [PATCH 58/96] [susy2] regenerate susy_gg_tt.sa, all ok at last! 
--- .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 9 ++++----- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 2208a5ec31..63369c306b 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -582,14 +582,13 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.136 s VVV1 FFV1 FFV1 FFV1 FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./HelAmps_MSSM_SLHA2.h INFO: Created file HelAmps_MSSM_SLHA2.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. -DEBUG: 'PIPPO!', param_indep_real_used =  PIPPO! {'mdl_I51x11'} [model_handling.py at line 830]  super_write_set_parameters_onlyfixMajorana (hardcoded=False) super_write_set_parameters_onlyfixMajorana (hardcoded=True) FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/./Parameters_MSSM_SLHA2.h @@ -598,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.484s -user 0m1.197s -sys 0m0.068s +real 0m1.363s +user 0m1.211s +sys 0m0.065s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index b9de715f48..8e3d187ddf 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -87,11 +87,10 @@ namespace mg5amcCpu // Print couplings that are changed event by event //void printDependentCouplings(); // now computed event-by-event (running alphas #373) - // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; static constexpr int nBsmIndepParam = 1; - double mdl_bsmIndepParam[ nBsmIndepParam ]; - + double mdl_bsmIndepParam[nBsmIndepParam]; + private: static Parameters_MSSM_SLHA2* instance; @@ -771,7 +770,7 @@ namespace mg5amcCpu // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; constexpr int nBsmIndepParam = 1; - __device__ constexpr double mdl_bsmIndepParam[ nBsmIndepParam ] = { mdl_I51x11 }; + __device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam] = { mdl_I51x11 }; } } // end namespace mg5amcGpu/mg5amcCpu From 21cec55f98567cd26a1c425ae642122ceeca20e7 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:16:28 +0100 Subject: [PATCH 59/96] [susy2] fixes in CODEGEN for ee_mumu.sa (after completing the backport from susy_tt.sa) --- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 1c5c40adfd..23d39e5362 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -811,7 +811,7 @@ 
def super_write_set_parameters_onlyfixMajorana(self, hardcoded): # FIXME! split res_strings.append( prefix+" %(width)s = -abs( %(width)s );" % {"width": particle.get('width')}) if len( res_strings ) != 0 : res_strings = [ prefix + " // Fixes for Majorana particles" ] + res_strings if not hardcoded: return '\n' + '\n'.join(res_strings) if res_strings else '' - else: return '\n'.join(res_strings) + '\n' + else: return '\n' + '\n'.join(res_strings) + '\n' if res_strings else '\n' # AV - replace export_cpp.UFOModelConverterCPP method (add hardcoded parameters and couplings) def super_generate_parameters_class_files(self): @@ -819,8 +819,8 @@ def super_generate_parameters_class_files(self): # First of all, identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM models # because they are used in the event by event calculation of alphaS-dependent couplings # WARNING! This is only implemented and has only been tested so far for real parameters (complex parameters need twice the storage) + param_indep_real_used = [] if self.model_name[:2] != 'sm' : - param_indep_real_used = [] for param in self.params_indep: if param.type == 'real': for coup in self.coups_dep.values(): @@ -866,7 +866,7 @@ def super_generate_parameters_class_files(self): assert super().write_parameters([]) == '', 'super().write_parameters([]) is not empty' # AV sanity check (#622) assert self.super_write_set_parameters_donotfixMajorana([]) == '', 'super_write_set_parameters_donotfixMajorana([]) is not empty' # AV sanity check (#622) ###misc.sprint(self.params_indep) # for debugging - hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep,param_indep_real_used).split('\n') ] # use param_indep_real_used as deviceparams + hrd_params_indep = [ 
line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep,param_indep_real_used).split('\n') if line != '' ] # use param_indep_real_used as deviceparams replace_dict['hardcoded_independent_parameters'] = '\n'.join( hrd_params_indep ) + self.super_write_set_parameters_onlyfixMajorana( hardcoded=True ) # add fixes for Majorana particles only in the aS-indep parameters #622 ###misc.sprint(self.coups_indep) # for debugging replace_dict['hardcoded_independent_couplings'] = self.write_hardcoded_parameters(self.coups_indep) From 314931561eb8a73c154d85ac77a408c2751b3561 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:24:21 +0100 Subject: [PATCH 60/96] [susy2] in CODEGEN model_handling.py, improve a variable name to indicate bsm --- .../CUDACPP_SA_OUTPUT/model_handling.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 23d39e5362..4925bcf0c2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -819,14 +819,14 @@ def super_generate_parameters_class_files(self): # First of all, identify which extra independent parameters must be made available through CPU static and GPU constant memory in BSM models # because they are used in the event by event calculation of alphaS-dependent couplings # WARNING! 
This is only implemented and has only been tested so far for real parameters (complex parameters need twice the storage) - param_indep_real_used = [] + bsmparam_indep_real_used = [] if self.model_name[:2] != 'sm' : for param in self.params_indep: if param.type == 'real': for coup in self.coups_dep.values(): if param.name in coup.expr: - param_indep_real_used.append( param.name ) - param_indep_real_used = set( param_indep_real_used ) + bsmparam_indep_real_used.append( param.name ) + bsmparam_indep_real_used = set( bsmparam_indep_real_used ) # Then do everything else replace_dict = self.default_replace_dict replace_dict['info_lines'] = PLUGIN_export_cpp.get_mg5_info_lines() @@ -846,8 +846,8 @@ def super_generate_parameters_class_files(self): replace_dict['set_independent_parameters'] += self.super_write_set_parameters_onlyfixMajorana( hardcoded=False ) # add fixes for Majorana particles only in the aS-indep parameters #622 if self.model_name[:2] != 'sm' : replace_dict['set_independent_parameters'] += '\n // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings;' - if len(param_indep_real_used) > 0: - for par in param_indep_real_used: + if len(bsmparam_indep_real_used) > 0: + for par in bsmparam_indep_real_used: replace_dict['set_independent_parameters'] += '\n mdl_bsmIndepParam[0] = %s;'%par else: replace_dict['set_independent_parameters'] += '\n // (none)' @@ -866,7 +866,7 @@ def super_generate_parameters_class_files(self): assert super().write_parameters([]) == '', 'super().write_parameters([]) is not empty' # AV sanity check (#622) assert self.super_write_set_parameters_donotfixMajorana([]) == '', 'super_write_set_parameters_donotfixMajorana([]) is not empty' # AV sanity check (#622) ###misc.sprint(self.params_indep) # for debugging - hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in 
self.write_hardcoded_parameters(self.params_indep,param_indep_real_used).split('\n') if line != '' ] # use param_indep_real_used as deviceparams + hrd_params_indep = [ line.replace('constexpr','//constexpr') + ' // now retrieved event-by-event (as G) from Fortran (running alphas #373)' if 'aS =' in line else line for line in self.write_hardcoded_parameters(self.params_indep,bsmparam_indep_real_used).split('\n') if line != '' ] # use bsmparam_indep_real_used as deviceparams replace_dict['hardcoded_independent_parameters'] = '\n'.join( hrd_params_indep ) + self.super_write_set_parameters_onlyfixMajorana( hardcoded=True ) # add fixes for Majorana particles only in the aS-indep parameters #622 ###misc.sprint(self.coups_indep) # for debugging replace_dict['hardcoded_independent_couplings'] = self.write_hardcoded_parameters(self.coups_indep) @@ -940,21 +940,21 @@ def super_generate_parameters_class_files(self): // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated -%s''' % ( '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if len( param_indep_real_used ) > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' ) +%s''' % ( '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if len( bsmparam_indep_real_used ) > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' ) replace_dict['bsmip0'] = ''' // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; static constexpr int nBsmIndepParam = %s; - %sdouble mdl_bsmIndepParam[nBsmIndepParam];''' % ( len( param_indep_real_used ), '' if len( param_indep_real_used ) > 0 else '//' ) + %sdouble mdl_bsmIndepParam[nBsmIndepParam];''' % ( len( bsmparam_indep_real_used ), '' if len( bsmparam_indep_real_used ) > 0 else '//' ) replace_dict['bsmip1'] = '''\n // BSM parameters that do not depend on alphaS but 
are needed in the computation of alphaS-dependent couplings; constexpr int nBsmIndepParam = %s; - %s__device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam]%s;''' % ( len( param_indep_real_used ), '' if len( param_indep_real_used ) > 0 else '//', ' = { %s }' % ', '.join( param_indep_real_used ) if len( param_indep_real_used ) > 0 else '' ) + %s__device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam]%s;''' % ( len( bsmparam_indep_real_used ), '' if len( bsmparam_indep_real_used ) > 0 else '//', ' = { %s }' % ', '.join( bsmparam_indep_real_used ) if len( bsmparam_indep_real_used ) > 0 else '' ) replace_dict['eftwarn0'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625)' replace_dict['eftwarn1'] = '\n//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625)' - if len( param_indep_real_used ) == 0: + if len( bsmparam_indep_real_used ) == 0: replace_dict['eftspecial0'] = ' // No additional parameters needed in constant memory for this BSM model' else: - replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s = bsmIndepParamPtr[%i];' % ( par, ipar ) for ipar, par in enumerate( param_indep_real_used ) ) + replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s = bsmIndepParamPtr[%i];' % ( par, ipar ) for ipar, par in enumerate( bsmparam_indep_real_used ) ) replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else From df071f021e17259b13fd5d05056022b894574f55 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:29:39 +0100 Subject: [PATCH 61/96] [susy2] regenerate all processes (and add constexpr_math.h) - all ok, will rerun tests (and try smeft too) --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 26 +- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 9 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 3 + .../ee_mumu.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 6 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/ee_mumu.mad/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 30 +-- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 9 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 3 + .../ee_mumu.sa/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 6 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 53 ++--- .../cudacpp/ee_mumu.sa/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 3 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 28 +-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 7 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 3 + .../gg_tt.mad/SubProcesses/testmisc.cc | 139 +++++++++++ epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 53 ++--- epochX/cudacpp/gg_tt.mad/src/constexpr_math.h | 223 ++++++++++++++++++ epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 3 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 30 +-- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 7 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 3 + .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 139 +++++++++++ 
epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 53 ++--- epochX/cudacpp/gg_tt.sa/src/constexpr_math.h | 223 ++++++++++++++++++ epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 3 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 32 +-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 7 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 7 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 3 + .../gg_tt01g.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 6 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_tt01g.mad/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 3 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 26 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 7 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 3 + .../gg_ttg.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_ttg.mad/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 3 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 30 +-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 7 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 3 + .../gg_ttg.sa/SubProcesses/testmisc.cc | 139 +++++++++++ epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 53 ++--- epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h | 223 ++++++++++++++++++ epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 3 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 32 +-- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 7 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 3 + .../gg_ttgg.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 6 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_ttgg.mad/src/constexpr_math.h | 223 ++++++++++++++++++ 
.../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 32 +-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 7 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 3 + .../gg_ttgg.sa/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_ttgg.sa/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 3 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 32 +-- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 7 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 3 + .../gg_ttggg.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 6 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_ttggg.mad/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 30 +-- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 7 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 3 + .../gg_ttggg.sa/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 6 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 53 ++--- .../cudacpp/gg_ttggg.sa/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 3 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 32 +-- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 7 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 7 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 3 + .../gq_ttq.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 53 ++--- .../cudacpp/gq_ttq.mad/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 3 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 40 ++-- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 7 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 7 +- 
.../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 3 + .../gq_ttq.sa/SubProcesses/testmisc.cc | 139 +++++++++++ epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 6 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 53 ++--- epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h | 223 ++++++++++++++++++ epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 70 ++++-- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 24 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 3 + .../heft_gg_h.sa/SubProcesses/testmisc.cc | 139 +++++++++++ .../heft_gg_h.sa/src/Parameters_heft.cc | 8 +- .../heft_gg_h.sa/src/Parameters_heft.h | 67 +++--- .../cudacpp/heft_gg_h.sa/src/constexpr_math.h | 223 ++++++++++++++++++ .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_mad_pp_tt012j_log.txt | 102 ++++---- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 7 +- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 7 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 7 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 7 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 7 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 7 +- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 7 +- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 7 +- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 7 +- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 7 +- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 7 +- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 7 +- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 7 +- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 7 +- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 7 +- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 7 +- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 7 +- .../P2_uxux_ttxuxux/CPPProcess.cc | 7 +- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 3 + .../pp_tt012j.mad/SubProcesses/testmisc.cc | 139 +++++++++++ .../pp_tt012j.mad/src/Parameters_sm.cc | 6 +- .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 53 ++--- .../pp_tt012j.mad/src/constexpr_math.h | 223 ++++++++++++++++++ 
.../pp_tt012j.mad/src/mgOnGpuCxtypes.h | 3 +- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 12 +- 141 files changed, 6248 insertions(+), 1023 deletions(-) create mode 100644 epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/ee_mumu.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_tt.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_tt.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttg.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/gq_ttq.mad/src/constexpr_math.h create mode 100644 epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/src/constexpr_math.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/src/constexpr_math.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index c0d823893a..0142beb14a 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005708217620849609  +DEBUG: model prefixing takes 0.005445718765258789  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -164,10 +164,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_ee_mumu INFO: remove old information in CODEGEN_mad_ee_mumu -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards  @@ -176,8 +176,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.101 s +Wrote files for 8 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.207 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 3 routines in 0.213 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.275 s +ALOHA: aloha creates 7 routines in 0.276 s FFV1 FFV1 FFV2 @@ -243,16 +243,16 @@ patching file matrix1.f Hunk #3 succeeded at 230 (offset 9 lines). Hunk #4 succeeded at 267 (offset 18 lines). Hunk #5 succeeded at 312 (offset 18 lines). -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README Run "open index.html" to see more information about this process. 
quit -real 0m1.910s -user 0m1.675s -sys 0m0.221s +real 0m1.906s +user 0m1.670s +sys 0m0.218s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 053b2b91be..6b66832a3d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -498,8 +499,8 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; - //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -610,7 +611,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -620,7 +621,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index d976fb5bba..bac556e863 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index ac333a365d..586ffc9955 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -191,8 +162,8 @@ namespace mg5amcCpu constexpr cxsmpl GC_59 = ( mdl_ee * mdl_complexi * mdl_sw ) / ( 2. * mdl_cw ); // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -233,16 +204,19 @@ namespace mg5amcCpu // (none) }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -285,12 +259,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); // NB: there are no aS-dependent couplings in this physics process mgDebug( 
1, __FUNCTION__ ); return; diff --git a/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h b/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 71b04c8320..3e3ecb225d 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005678653717041016  +DEBUG: model prefixing takes 0.005336284637451172  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -162,29 +162,29 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 
'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.272 s +ALOHA: aloha creates 4 routines in 0.264 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.668s -user 0m0.613s -sys 0m0.048s -Code generation completed in 0 seconds +real 0m0.703s +user 0m0.574s +sys 0m0.064s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 728a776da5..556b827c55 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -496,8 +497,8 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; - //for ( i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<3; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -608,7 +609,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -618,7 +619,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- 
a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index d976fb5bba..bac556e863 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index ac333a365d..586ffc9955 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -191,8 +162,8 @@ namespace mg5amcCpu constexpr cxsmpl GC_59 = ( mdl_ee * mdl_complexi * mdl_sw ) / ( 2. * mdl_cw ); // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -233,16 +204,19 @@ namespace mg5amcCpu // (none) }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -285,12 +259,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); // NB: there are no aS-dependent couplings in this physics process mgDebug( 
1, __FUNCTION__ ); return; diff --git a/epochX/cudacpp/ee_mumu.sa/src/constexpr_math.h b/epochX/cudacpp/ee_mumu.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 1fc03e0c34..78f6fa5a2b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005725383758544922  +DEBUG: model prefixing takes 0.005376338958740234  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -165,10 +165,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_tt INFO: remove old information in CODEGEN_mad_gg_tt -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 2 routines in 0.781 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.215 s VVV1 FFV1 FFV1 @@ -232,17 +232,17 @@ DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt patching file auto_dsig1.f patching file driver.f patching file matrix1.f -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README Run "open index.html" to see more information about this process. 
quit -real 0m1.715s -user 0m1.502s -sys 0m0.214s -Code generation completed in 2 seconds +real 0m2.602s +user 0m1.487s +sys 0m0.213s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 09a2a7b6fb..3bda469e3e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -514,7 +515,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -625,7 +626,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -635,7 +636,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath 
in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index cfedf492ab..ad143d7917 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 3a739b12fc..e1493c899c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -234,16 +205,19 @@ namespace mg5amcCpu cxtype_sv GC_11; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -287,12 +261,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); diff --git a/epochX/cudacpp/gg_tt.mad/src/constexpr_math.h b/epochX/cudacpp/gg_tt.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast<long double>( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0<xx<pi/4 (long double signature) + constexpr long double sinTaylor( const long double xx ) + { + assert( xx >= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1<xx<1 (long double signature) + constexpr long double atanTaylor( const long double xx ) + { + assert( xx >= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 96a207eb00..8d61954ee9 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005444765090942383  +DEBUG: model prefixing takes 0.005337953567504883  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -163,27 +163,27 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 
'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 
'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.144 s VVV1 FFV1 FFV1 @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.539s -user 0m0.476s -sys 0m0.058s -Code generation completed in 1 seconds +real 0m0.549s +user 0m0.473s +sys 0m0.052s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 4700a044de..9931c27ce7 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -511,7 +512,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -622,7 +623,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -632,7 +633,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 
+###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index cfedf492ab..ad143d7917 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 3a739b12fc..e1493c899c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -234,16 +205,19 @@ namespace mg5amcCpu cxtype_sv GC_11; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -287,12 +261,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); diff --git a/epochX/cudacpp/gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/gg_tt.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 99081f6854..47a3a30985 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005807399749755859  +DEBUG: model prefixing takes 0.00570368766784668  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -173,10 +173,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_tt01g INFO: remove old information in CODEGEN_mad_gg_tt01g -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards  @@ -187,8 +187,8 @@ INFO: Processing color information for process: g g > t t~ g @2 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 
6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -204,8 +204,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,22 +221,22 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.247 s +Wrote files for 46 helas calls in 0.241 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.334 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 5 routines in 0.326 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha 
creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.320 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -278,16 +278,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README Run "open index.html" to see more information about this process. quit -real 0m2.337s -user 0m2.082s -sys 0m0.248s +real 0m2.395s +user 0m2.049s +sys 0m0.237s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 09a2a7b6fb..3bda469e3e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -514,7 +515,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -625,7 +626,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); 
+ G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -635,7 +636,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index 244cea408f..4d3ef72615 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -734,7 +735,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -846,7 +847,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -856,7 +857,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_tt01g.mad/src/constexpr_math.h b/epochX/cudacpp/gg_tt01g.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index fa7fec111b..9cc19e9825 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005398988723754883  +DEBUG: model prefixing takes 0.005432605743408203  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -165,10 +165,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttg INFO: remove old information in CODEGEN_mad_gg_ttg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  
[export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -201,15 +201,15 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.338 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 5 routines in 0.327 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.332 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -247,16 +247,16 @@ Hunk #2 succeeded at 159 (offset 16 lines). Hunk #3 succeeded at 237 (offset 16 lines). Hunk #4 succeeded at 265 (offset 16 lines). Hunk #5 succeeded at 310 (offset 16 lines). -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README Run "open index.html" to see more information about this process. 
quit -real 0m2.239s -user 0m2.005s -sys 0m0.234s +real 0m2.223s +user 0m1.930s +sys 0m0.226s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 9d12dfe988..6611b68803 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -734,7 +735,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -846,7 +847,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -856,7 +857,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 
+###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttg.mad/src/constexpr_math.h b/epochX/cudacpp/gg_ttg.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ecf3e1d46a..c69338efb0 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005639791488647461  +DEBUG: model prefixing takes 0.005579233169555664  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -163,30 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 
'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 
'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.332 s +ALOHA: aloha creates 5 routines in 0.323 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.790s -user 0m0.736s -sys 0m0.050s +real 0m0.773s +user 0m0.714s +sys 0m0.051s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 8df965ae6d..081e714ee5 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -728,7 +729,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -840,7 +841,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -850,7 +851,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # 
Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h b/epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 11131eaf14..9294477de4 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005640506744384766  +DEBUG: model prefixing takes 0.005306720733642578  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -165,10 +165,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttgg INFO: remove old information in CODEGEN_mad_gg_ttgg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: 
process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,23 +193,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s -Wrote files for 222 helas calls in 0.704 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s +Wrote files for 222 helas calls in 0.687 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 5 routines in 0.327 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.322 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -250,17 +250,17 @@ Hunk #2 succeeded at 191 (offset 48 lines). Hunk #3 succeeded at 269 (offset 48 lines). Hunk #4 succeeded at 297 (offset 48 lines). Hunk #5 succeeded at 342 (offset 48 lines). 
-DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README Run "open index.html" to see more information about this process. quit -real 0m3.333s -user 0m3.080s -sys 0m0.236s -Code generation completed in 3 seconds +real 0m3.366s +user 0m2.984s +sys 0m0.248s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index eef94d5587..b7e40edf31 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2679,7 +2680,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -2792,7 +2793,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, 
ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttgg.mad/src/constexpr_math.h b/epochX/cudacpp/gg_ttgg.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 38a3c3a518..f2e8e57a44 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00548243522644043  +DEBUG: model prefixing takes 0.005572795867919922  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.162 s +1 processes with 123 diagrams generated in 0.159 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -163,30 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 
'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 
'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.435 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.315 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.468s -user 0m1.398s -sys 0m0.062s -Code generation completed in 1 seconds +real 0m1.522s +user 0m1.362s +sys 0m0.061s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index bb8f6a9899..e6eb707fa4 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2736,7 +2737,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -2849,7 +2850,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -2859,7 +2860,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for 
constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttgg.sa/src/constexpr_math.h b/epochX/cudacpp/gg_ttgg.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index e8b21a0952..79e09690a5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005578756332397461  +DEBUG: model prefixing takes 0.005720376968383789  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.919 s +1 processes with 1240 diagrams generated in 1.873 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -165,10 +165,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gg_ttggg INFO: remove old information in CODEGEN_mad_gg_ttggg -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards  @@ -179,8 +179,8 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. 
Introduce 3030 contraction -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,23 +195,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.694 s -Wrote files for 2281 helas calls in 18.830 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.567 s +Wrote files for 2281 helas calls in 18.534 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.326 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 5 routines in 0.314 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.320 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -252,17 +252,17 @@ Hunk #2 succeeded at 255 (offset 112 lines). Hunk #3 succeeded at 333 (offset 112 lines). Hunk #4 succeeded at 361 (offset 112 lines). Hunk #5 succeeded at 406 (offset 112 lines). 
-DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README Run "open index.html" to see more information about this process. quit -real 0m30.669s -user 0m29.221s -sys 0m0.403s -Code generation completed in 31 seconds +real 0m29.191s +user 0m28.652s +sys 0m0.431s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index b883c550b2..ed2ea4ce28 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -30345,7 +30346,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -30459,7 +30460,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = 
MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. 
from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + 
testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute 
tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. 
not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. 
* mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - 
DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttggg.mad/src/constexpr_math.h b/epochX/cudacpp/gg_ttggg.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? 
sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / 
constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - 
constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << 
atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? 
in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 6d60b544b0..b33a603bdb 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005746364593505859  +DEBUG: model prefixing takes 0.0054645538330078125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.934 s +1 processes with 1240 diagrams generated in 1.881 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -163,30 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 
166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Processing color information for process: g g > t t~ g g g @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", 
self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.709 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.597 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.354 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.375s -user 0m13.203s -sys 0m0.119s +real 0m13.328s +user 0m12.880s +sys 0m0.107s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index c6b0dfe8b7..fd7d52e24c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -32235,7 +32236,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -32349,7 +32350,7 @@ namespace mg5amcCpu using namespace 
mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -32359,7 +32360,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... 
https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( 
std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... 
+ testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? 
+ }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ 
Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? 
i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. 
<> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/gg_ttggg.sa/src/constexpr_math.h b/epochX/cudacpp/gg_ttggg.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 7015773962..654a104b7a 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005769014358520508  +DEBUG: model prefixing takes 0.00540471076965332  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,10 +180,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_gq_ttq INFO: remove old information in CODEGEN_mad_gq_ttq -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards  @@ -200,8 +200,8 @@ INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 
@1 INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,8 +217,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -234,16 +234,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.223 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.147 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 2 routines in 0.151 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.131 s FFV1 FFV1 FFV1 @@ -289,16 +289,16 @@ Hunk #2 succeeded at 162 (offset 19 lines). Hunk #3 succeeded at 247 (offset 26 lines). Hunk #4 succeeded at 281 (offset 32 lines). Hunk #5 succeeded at 326 (offset 32 lines). -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README Run "open index.html" to see more information about this process. 
quit -real 0m1.956s -user 0m1.706s -sys 0m0.245s +real 0m1.937s +user 0m1.715s +sys 0m0.200s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index bf2e37aa30..00c62e5820 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -570,7 +571,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -682,7 +683,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -692,7 +693,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index aac21c1530..5006ce7768 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include 
#include @@ -570,7 +571,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -682,7 +683,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -692,7 +693,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include 
+//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), 
constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... 
+ testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? 
+ }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index 995560d289..05be4ea41c 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ 
Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 72697d1533..00b35be0b5 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? 
i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -234,16 +205,19 @@ namespace mg5amcCpu cxtype_sv GC_10; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. 
<> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -287,12 +261,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); diff --git a/epochX/cudacpp/gq_ttq.mad/src/constexpr_math.h b/epochX/cudacpp/gq_ttq.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 8ed0b3a1c7..3451aaba00 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005808115005493164  +DEBUG: model prefixing takes 0.005368471145629883  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.084 s +8 processes with 40 diagrams generated in 0.083 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -178,8 +178,8 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 @@ -192,30 +192,30 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 
197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 
'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=1 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=1 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 
'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.203 s +ALOHA: aloha creates 2 routines in 0.144 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.010s -user 0m0.617s -sys 0m0.055s +real 0m0.659s +user 0m0.579s +sys 0m0.068s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 19e98a1b95..e82385b24d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -565,7 +566,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -677,7 +678,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, 
bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -687,7 +688,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 3b95bff2b8..0fad1abce3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -565,7 +566,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -677,7 +678,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -687,7 +688,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- 
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? 
d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... 
+ else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), 
constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index 995560d289..05be4ea41c 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 72697d1533..00b35be0b5 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? 
curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. 
* mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -234,16 +205,19 @@ namespace mg5amcCpu cxtype_sv GC_10; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -287,12 +261,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, 
idcoup_GC_11 ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); diff --git a/epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h b/epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? 
i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of 
constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( 
constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double 
constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? 
in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index ef394b2a87..5af82fa96a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -62,12 +62,54 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/models  +--2024-02-29 19:27:58-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz +Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.1.243 +Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. +HTTP request sent, awaiting response... 200 OK +Length: 50876 (50K) [application/x-gzip] +Saving to: ‘tmp.tgz’ + + 0K .......... .......... .......... .......... ......... 
100% 1.03M=0.05s + +2024-02-29 19:27:58 (1.03 MB/s) - ‘tmp.tgz’ saved [50876/50876] + +heft/ +heft/write_param_card.py +heft/restrict_ckm.dat +heft/couplings.py +heft/HEFT_UFO.log +heft/lorentz.py +heft/__init__.py +heft/__pycache__/ +heft/particles.py +heft/object_library.py +heft/restrict_default.dat +heft/restrict_zeromass_ckm.dat +heft/restrict_no_b_mass.dat +heft/function_library.py +heft/parameters.py +heft/py3_model.pkl +heft/coupling_orders.py +heft/restrict_no_tau_mass.dat +heft/vertices.py +heft/restrict_no_masses.dat +heft/__pycache__/write_param_card.cpython-311.pyc +heft/__pycache__/parameters.cpython-311.pyc +heft/__pycache__/function_library.cpython-311.pyc +heft/__pycache__/coupling_orders.cpython-311.pyc +heft/__pycache__/object_library.cpython-311.pyc +heft/__pycache__/couplings.cpython-311.pyc +heft/__pycache__/particles.cpython-311.pyc +heft/__pycache__/vertices.cpython-311.pyc +heft/__pycache__/lorentz.cpython-311.pyc +heft/__pycache__/__init__.cpython-311.pyc INFO: reload from .py file INFO: load particles INFO: load vertices WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.005836963653564453  +DEBUG: model prefixing takes 0.006208658218383789  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -143,26 +185,26 @@ Load PLUGIN.CUDACPP_OUTPUT It has been validated for the last time with version: 3.5.2 Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  plugin [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 -DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 196]  -DEBUG: type(subproc_group)= [output.py at line 197]  -DEBUG: type(fortran_model)= [output.py at line 198]  -DEBUG: type(me)= me=0 [output.py at line 199]  -DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 
'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 200]  +DEBUG: Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [output.py at line 195]  +DEBUG: type(subproc_group)= [output.py at line 196]  +DEBUG: type(fortran_model)= [output.py at line 197]  +DEBUG: type(me)= me=0 [output.py at line 198]  +DEBUG: "need to link", self.to_link_in_P =  need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [output.py at line 199]  INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h FileWriter for 
/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.063 s +ALOHA: aloha creates 1 routines in 0.071 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -174,7 +216,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
quit -real 0m0.451s -user 0m0.390s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m1.586s +user 0m0.442s +sys 0m0.073s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index aca0e78235..319ca3e12c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,20 @@ namespace mg5amcCpu //static fptype* cIPD = nullptr; // unused as nparam=0 static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) @@ -463,10 +478,15 @@ namespace mg5amcCpu #ifdef MGONGPUCPP_GPUIMPL //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 + if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 ) + gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 + if( 
Parameters_MSSM_SLHA2::nBsmIndepParam > 0 ) + memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); #endif + //for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -576,7 +596,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -586,7 +606,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... 
https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( 
std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... 
+ testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? 
+ }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index 8b23599111..7a9cdb8cd3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -40,9 +40,9 @@ 
Parameters_heft::getInstance() void Parameters_heft::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH1 = slha.get_block_entry( "decay", 9000006, 6.382339e-03 ); mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); @@ -102,6 +102,8 @@ Parameters_heft::setIndependentParameters( SLHAReader& slha ) mdl_gw__exp__2 = ( ( mdl_gw ) * ( mdl_gw ) ); mdl_cw__exp__2 = ( ( mdl_cw ) * ( mdl_cw ) ); mdl_sw__exp__2 = ( ( mdl_sw ) * ( mdl_sw ) ); + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + // (none) } void diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 28297e6be4..9ee9c417f2 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -20,10 +20,16 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. not in EFT models) for the moment (#439) -#error This non-SM physics process only supports MGONGPU_HARDCODE_PARAM builds (#439): please run "make HRDCOD=1" +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + +#ifndef MGONGPU_HARDCODE_PARAM +//#warning Support for non-SM physics processes (e.g. 
SUSY or EFT) is still limited for HRDCOD=0 builds (#439 and PR #625) #include "read_slha.h" @@ -80,6 +86,9 @@ namespace mg5amcCpu // Print couplings that are changed event by event //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + static constexpr int nBsmIndepParam = 0; + //double mdl_bsmIndepParam[nBsmIndepParam]; private: @@ -89,6 +98,7 @@ namespace mg5amcCpu } // end namespace mg5amcGpu/mg5amcCpu #else +//#warning Support for non-SM physics processes (e.g. SUSY or EFT) is still limited for HRDCOD=1 builds (#439 and PR #625) #include #include @@ -103,37 +113,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_heft // keep the same name rather than HardcodedParameters_heft for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? 
i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -201,8 +180,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. * mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) //constexpr double mdl_GH = -( mdl_G__exp__2 * ( 1. + ( 13. * mdl_MH__exp__6 ) / ( 16800. * mdl_MT__exp__6 ) + mdl_MH__exp__4 / ( 168. * mdl_MT__exp__4 ) + ( 7. * mdl_MH__exp__2 ) / ( 120. * mdl_MT__exp__2 ) ) ) / ( 12. * ( ( M_PI ) * ( M_PI ) ) * mdl_v ); // now computed event-by-event (running alphas #373) //constexpr double mdl_Gphi = -( mdl_G__exp__2 * ( 1. + mdl_MH__exp__6 / ( 560. * mdl_MT__exp__6 ) + mdl_MH__exp__4 / ( 90. * mdl_MT__exp__4 ) + mdl_MH__exp__2 / ( 12. * mdl_MT__exp__2 ) ) ) / ( 8. 
* ( ( M_PI ) * ( M_PI ) ) * mdl_v ); // now computed event-by-event (running alphas #373) @@ -221,6 +200,10 @@ namespace mg5amcCpu // Print couplings that are changed event by event //void printDependentCouplings(); // now computed event-by-event (running alphas #373) + + // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; + constexpr int nBsmIndepParam = 0; + //__device__ constexpr double mdl_bsmIndepParam[nBsmIndepParam]; } } // end namespace mg5amcGpu/mg5amcCpu @@ -245,16 +228,19 @@ namespace mg5amcCpu cxtype_sv GC_13; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_heft; +#else + // No additional parameters needed in constant memory for this BSM model #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_heft) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -322,12 +308,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_heft_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_13s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_13 ); cxtype_sv_ref GC_13s_sv = C_ACCESS::kernelAccess( GC_13s ); GC_13s_sv = couplings_sv.GC_13; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/constexpr_math.h b/epochX/cudacpp/heft_gg_h.sa/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. 
+ +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 
1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) 
); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + 
assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 
100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index fa869aa432..e2da496917 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00567317008972168  +DEBUG: model prefixing takes 0.00541996955871582  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.141 s +13 processes with 76 diagrams generated in 0.137 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. 
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.880 s +65 processes with 1119 diagrams generated in 1.831 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -388,10 +388,10 @@ It has been validated for the last time with version: 3.5.2 Addition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT Output will be done with PLUGIN: CUDACPP_OUTPUT DEBUG: cformat =  standalone_simd [export_cpp.py at line 3071]  -DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 162]  +DEBUG: Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [output.py at line 161]  INFO: initialize a new directory: CODEGEN_mad_pp_tt012j INFO: remove old information in CODEGEN_mad_pp_tt012j -DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 167]  +DEBUG: Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [output.py at line 166]  WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j  INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j WARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards  @@ -499,8 +499,8 @@ INFO: Combined process c c~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg 
-DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -516,8 +516,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -533,8 +533,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -550,8 +550,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -567,8 +567,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -584,8 +584,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -601,8 +601,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -618,8 +618,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -635,8 +635,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -652,8 +652,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -669,8 +669,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -686,8 +686,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -703,8 +703,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -720,8 +720,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -737,8 +737,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -754,8 +754,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -771,8 +771,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -788,8 +788,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1057]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,23 +804,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.306 s -Wrote files for 810 helas calls in 3.324 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.288 s +Wrote files for 810 helas calls in 3.872 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s -DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 205]  +ALOHA: aloha creates 5 routines in 0.334 s +DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.317 s +ALOHA: aloha creates 10 routines in 0.318 s VVV1 VVV1 FFV1 @@ -1023,17 +1023,17 @@ Hunk #2 succeeded at 194 (offset 51 lines). Hunk #3 succeeded at 272 (offset 51 lines). Hunk #4 succeeded at 300 (offset 51 lines). Hunk #5 succeeded at 345 (offset 51 lines). -DEBUG: p.returncode =  0 [output.py at line 241]  +DEBUG: p.returncode =  0 [output.py at line 240]  Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done. Type "launch" to generate events from this process, or see /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README Run "open index.html" to see more information about this process. 
quit -real 0m10.739s -user 0m8.492s -sys 0m0.454s -Code generation completed in 11 seconds +real 0m9.475s +user 0m8.365s +sys 0m0.419s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index f460137979..f2e34f3cc4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -514,7 +515,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -625,7 +626,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -635,7 +636,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index e7ea5ac849..87e2241a5e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -491,7 +492,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -602,7 +603,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -612,7 +613,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 9d12dfe988..6611b68803 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -734,7 +735,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -846,7 +847,7 @@ 
namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -856,7 +857,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 533950cdc5..61576da03d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -570,7 +571,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -682,7 +683,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -692,7 +693,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } 
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index aff905a9c3..ec8a171811 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -570,7 +571,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -682,7 +683,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -692,7 +693,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 31821b9236..1df0359166 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -570,7 +571,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; 
i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -682,7 +683,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -692,7 +693,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 5b259c177e..63dc41ba9c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -2679,7 +2680,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -2792,7 +2793,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -2802,7 +2803,7 @@ namespace 
mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index c8c43987c5..b0462c7741 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1074,7 +1075,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -1187,7 +1188,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 5bd91aafa9..44603af8fa 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ 
b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1074,7 +1075,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -1187,7 +1188,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 5793f4525f..76494fea01 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1074,7 +1075,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ 
-1187,7 +1188,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index f938395fee..9e0123b38e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -649,7 +650,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -762,7 +763,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -772,7 +773,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, 
bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index 263463b7a9..3adb4e237d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -655,7 +656,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -768,7 +769,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -778,7 +779,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 0a075beaac..b3a3804c9d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -759,7 +760,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // 
nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -872,7 +873,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -882,7 +883,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 3408b49036..4dc65ee56c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -655,7 +656,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -768,7 +769,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; 
@@ -778,7 +779,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 46c044c861..343d0f7ef7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1074,7 +1075,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -1187,7 +1188,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 2180832b3b..b2b189eeea 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -759,7 +760,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -872,7 +873,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -882,7 +883,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 00fe64d5b1..9321d65fe4 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -649,7 +650,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << 
tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -762,7 +763,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -772,7 +773,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index fbeb9bcb41..073e575c03 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -759,7 +760,7 @@ namespace mg5amcCpu memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #endif - //for ( i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; + //for ( int i=0; i<2; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl; } #else // Initialize process (with hardcoded parameters) @@ -872,7 +873,7 @@ namespace mg5amcCpu using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; - G2COUP( allgs, allcouplings ); + G2COUP( allgs, allcouplings, bsmIndepParam ); #else using namespace mg5amcCpu; using G_ACCESS = HostAccessGs; @@ -882,7 +883,7 @@ namespace mg5amcCpu const int ievt0 = ipagV * neppV; const fptype* gs = MemoryAccessGs::ieventAccessRecordConst( allgs, ievt0 ); fptype* couplings = 
MemoryAccessCouplings::ieventAccessRecord( allcouplings, ievt0 ); - G2COUP( gs, couplings ); + G2COUP( gs, couplings, bsmIndepParam ); } #endif } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 3ad91dfd59..f7a61d3e74 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -847,6 +847,9 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Test quadmath in testmisc.cc tests for constexpr_math #627 +###$(testmain): LIBFLAGS += -lquadmath + # Bypass std::filesystem completely to ease portability on LUMI #803 #ifneq ($(findstring hipcc,$(GPUCC)),) #$(testmain): LIBFLAGS += -lstdc++fs diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index ac0b049e60..8c29482e5a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -10,10 +10,14 @@ #include "mgOnGpuVectors.h" +#include "constexpr_math.h" #include "epoch_process_id.h" #include +//#include +//#include // needs C++20... https://stackoverflow.com/a/65347016 +#include #include #include @@ -295,4 +299,139 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) } //-------------------------------------------------------------------------- + + // Test constexpr floor + EXPECT_TRUE( constexpr_floor( 1.5 ) == 1 ); + EXPECT_TRUE( constexpr_floor( 0.5 ) == 0 ); + EXPECT_TRUE( constexpr_floor( -0.5 ) == -1 ); + EXPECT_TRUE( constexpr_floor( -1.5 ) == -2 ); + + // Distance from the horizontal or vertical axis (i.e. 
from 0, pi/2, pi, or 3pi/2) + auto distance4 = []( const long double xx ) + { + const long double xx2 = mapIn0to2Pi( xx ); // in [0,2*pi) + const long double xx3 = xx2 - constexpr_floor( xx2 / constexpr_pi_by_2 ) * constexpr_pi_by_2; // in [0,pi/2) + const long double d0 = xx3; // distance from 0 + const long double d1 = constexpr_pi_by_2 - xx3; // distance from pi/2 + return ( d0 < d1 ? d0 : d1 ); + }; + + // Test constexpr sin, cos, tan - specific, problematic, points + auto testSinCosTanX = []( const long double xx, const double tolerance, const bool debug = false, const long long istep = -999999999 ) + { + const double x = (double)xx; + if( debug ) + { + //std::cout << std::setprecision(40) << "testSinCosTanX: xx= " << xx << std::endl; + //std::cout << std::setprecision(40) << " x= " << x << std::endl; + } + //std::cout << std::setprecision(40) << "xx - 3pi/2 " << xx - 3 * constexpr_pi_by_2 << std::endl; + //int width = 46; + //char buf[128]; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)xx ); + //std::cout << std::setprecision(40) << "testSinCosTanX: xx=" << buf << std::endl; + //quadmath_snprintf( buf, sizeof( buf ), "%+-#*.40Qe", width, (__float128)x ); + //std::cout << std::setprecision(40) << " x= " << buf << std::endl; + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::abs( std::sin( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::abs( std::cos( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::abs( std::tan( x ) * tolerance ) ) + << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; + std::cout << std::setprecision( 6 ); // default + }; + testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h + 
testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) + testSinCosTanX( 3 * constexpr_pi_by_2 - 1.9601e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) + + // Test constexpr sin, cos, tan - 8 points on (or close to) the boundaries of the 8 sectors of [0,2*pi] + auto testSinCosTan8 = [testSinCosTanX]( const double deltax, const double tolerance ) + { + for( int ioff = -1; ioff < 2; ioff++, ioff++ ) // -1, 1 + { + const bool debug = false; + const int nstep = 8; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = deltax * ioff; + long double x1 = deltax * ioff + 2 * constexpr_pi; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + testSinCosTanX( x, tolerance, debug, istep ); + } + } + }; + + // Use much lower tolerance when testing on the boundaries of the 8 sectors of [0,2*pi] + // Use progressively stricter tolerances as you move away from the boundaries of the 8 sectors of [0,2*pi] + testSinCosTan8( 0, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-15, 1E-03 ); // fails with 1E-04 - DANGEROUS ANYWAY... + testSinCosTan8( 1E-14, 1E-04 ); // fails with 1E-05 + testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 + testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 + testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 + testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 + testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ + // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance + auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) + { + auto toleranceForX = [distance4]( const double x ) + { + const double d4 = distance4( x ); + if( d4 < 1E-14 ) + return 1E-03; // NB: absolute distance limited to 1E-14 anyway even if relative tolerance is 1E-3... + else if( d4 < 1E-13 ) + return 1E-04; + else if( d4 < 1E-12 ) + return 1E-05; + else if( d4 < 1E-11 ) + return 1E-06; + else if( d4 < 1E-10 ) + return 1E-07; + else if( d4 < 1E-09 ) + return 1E-08; + else if( d4 < 1E-08 ) + return 1E-09; + else if( d4 < 1E-07 ) + return 1E-10; + else if( d4 < 1E-06 ) + return 1E-11; + else if( d4 < 1E-05 ) + return 1E-12; + else if( d4 < 1E-04 ) + return 1E-13; + else + return 1E-14; // play it safe even if the agreement might even be better? + }; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + const double tolerance = toleranceForX( x ); + EXPECT_NEAR( std::sin( x ), constexpr_sin( x ), std::max( std::abs( std::sin( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::cos( x ), constexpr_cos( x ), std::max( std::abs( std::cos( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + EXPECT_NEAR( std::tan( x ), constexpr_tan( x ), std::max( std::abs( std::tan( x ) * tolerance ), 3E-15 ) ) + << std::setprecision( 40 ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ",\n istep=" << istep << ", distance4=" << distance4( x ); + } + }; + testSinCosTanN( 100, -4 * constexpr_pi, 6 * constexpr_pi ); // this was failing at 3*pi/2 (now fixed by absolute 
tolerance 3E-15) + testSinCosTanN( 10000, -constexpr_pi_by_2, 5 * constexpr_pi_by_2 ); + + // Test constexpr atan + { + const double tolerance = 1E-12; + const int nstep = 1000; + for( int istep = 0; istep < nstep + 1; istep++ ) + { + long double x0 = -5, x1 = +5; + double x = x0 + istep * ( x1 - x0 ) / nstep; // test this for double (else std::cos and std::sin use long double) + EXPECT_NEAR( std::atan( x ), constexpr_atan( x ), std::abs( std::atan( x ) * tolerance ) ) + << "x=" << x << ", istep=" << istep; + } + } + + //-------------------------------------------------------------------------- } diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index f88c3b34b0..21c3ee46c1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -40,9 +40,9 @@ Parameters_sm::getInstance() void Parameters_sm::setIndependentParameters( SLHAReader& slha ) { - zero = 0; // define "zero" - ZERO = 0; // define "zero" - //std::vector indices(2, 0); // prepare a vector for indices + zero = 0; // define "zero" + ZERO = 0; // define "zero" + std::vector indices( 2, 0 ); // prepare a vector for indices mdl_WH = slha.get_block_entry( "decay", 25, 6.382339e-03 ); mdl_WW = slha.get_block_entry( "decay", 24, 2.047600e+00 ); mdl_WZ = slha.get_block_entry( "decay", 23, 2.441404e+00 ); diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 4a0620c86e..5660a06a4d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -20,9 +20,11 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuVectors.h" +#include "constexpr_math.h" + //========================================================================== -#ifndef MGONGPU_HARDCODE_PARAM // this is only supported in SM processes (e.g. 
not in EFT models) for the moment (#439) +#ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" @@ -102,37 +104,6 @@ namespace mg5amcCpu // Hardcoded constexpr physics parameters namespace Parameters_sm // keep the same name rather than HardcodedParameters_sm for simplicity { - // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) - double constexpr sqrtNewtonRaphson( double x, double curr, double prev ) - { - return curr == prev ? curr : sqrtNewtonRaphson( x, 0.5 * ( curr + x / curr ), curr ); - } - double constexpr constexpr_sqrt( double x ) - { - return x >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math - ? sqrtNewtonRaphson( x, x, 0 ) - : std::numeric_limits::quiet_NaN(); - } - - // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) - constexpr int constexpr_floor( double d ) - { - const int i = static_cast( d ); - return d < i ? i - 1 : i; - } - - // Constexpr implementation of pow - constexpr double constexpr_pow( double base, double exp ) - { - // NB(1): this implementation of constexpr_pow requires exponent >= 0 - assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // NB(2): this implementation of constexpr_pow requires an integer exponent - const int iexp = constexpr_floor( exp ); - assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" - // Iterative implementation of pow if exp is a non negative integer - return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); - } - // Model parameters independent of aS constexpr double zero = 0; constexpr double ZERO = 0; @@ -189,8 +160,8 @@ namespace mg5amcCpu // (none) // Model parameters dependent on aS - //constexpr double mdl_sqrt__aS = //constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) - //constexpr double G = 2. 
* mdl_sqrt__aS * //constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) + //constexpr double mdl_sqrt__aS = constexpr_sqrt( aS ); // now computed event-by-event (running alphas #373) + //constexpr double G = 2. * mdl_sqrt__aS * constexpr_sqrt( M_PI ); // now computed event-by-event (running alphas #373) //constexpr double mdl_G__exp__2 = ( ( G ) * ( G ) ); // now computed event-by-event (running alphas #373) // Model couplings dependent on aS @@ -237,16 +208,19 @@ namespace mg5amcCpu cxtype_sv GC_12; }; #pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> -#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; +#else + // SM implementation - no special handling of non-hardcoded parameters (PR #625) #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_sm) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -291,12 +265,13 @@ namespace mg5amcCpu template __device__ inline void G2COUP( const fptype gs[], - fptype couplings[] ) + fptype couplings[], + const fptype* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - 
DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv, bsmIndepParamPtr ); fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); fptype* GC_12s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_12 ); diff --git a/epochX/cudacpp/pp_tt012j.mad/src/constexpr_math.h b/epochX/cudacpp/pp_tt012j.mad/src/constexpr_math.h new file mode 100644 index 0000000000..78ff8b16ab --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/src/constexpr_math.h @@ -0,0 +1,223 @@ +// Copyright (C) 2020-2024 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: A. Valassi (Feb 2024) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2024) for the MG5aMC CUDACPP plugin. + +#ifndef constexpr_math_h +#define constexpr_math_h 1 + +#include "mgOnGpuConfig.h" + +#include +#include +#include + +// FOR DEBUGGING! +#undef CONSTEXPR_MATH_DEBUG // no-debug +//#define CONSTEXPR_MATH_DEBUG 1 // debug +#ifdef CONSTEXPR_MATH_DEBUG +#define constexpr const +#endif + +// NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +#else +namespace mg5amcCpu +#endif +{ + // Constexpr implementation of sqrt (see https://stackoverflow.com/a/34134071) + constexpr long double sqrtNewtonRaphson( const long double xx, const long double curr, const long double prev ) + { + return curr == prev ? curr : sqrtNewtonRaphson( xx, 0.5 * ( curr + xx / curr ), curr ); + } + constexpr long double constexpr_sqrt( const long double xx ) + { + return xx >= 0 // && x < std::numeric_limits::infinity() // avoid -Wtautological-constant-compare warning in fast math + ? 
sqrtNewtonRaphson( xx, xx, 0 ) + : std::numeric_limits::quiet_NaN(); + } + + // Constexpr implementation of floor (see https://stackoverflow.com/a/66146159) + constexpr int constexpr_floor( const long double xx ) + { + const int i = static_cast( xx ); + return xx < i ? i - 1 : i; + } + + // Constexpr implementation of pow + constexpr long double constexpr_pow( const long double base, const long double exp ) + { + // NB(1): this implementation of constexpr_pow requires exponent >= 0 + assert( exp >= 0 ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // NB(2): this implementation of constexpr_pow requires an integer exponent + const int iexp = constexpr_floor( exp ); + assert( static_cast( iexp ) == exp ); // NB would fail at compile time with "error: call to non-‘constexpr’ function ‘void __assert_fail'" + // Iterative implementation of pow if exp is a non negative integer + return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); + } + + // PI from cmath + constexpr long double constexpr_pi = M_PIl; // pi + constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + + // Constexpr implementation of sin for 0= 0 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + assert( xx < constexpr_pi_by_4 && "The argument of sinTaylor is assumed to be in [0,pi/4)" ); + long double sinx = 0; + int ipow = 1; + long double delta = xx; + while( true ) + { + long double sinxlast = sinx; + sinx += delta; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", delta=" << delta << ", sinx=" << sinx << std::endl; // for debugging (not constexpr) +#endif + if( sinx == sinxlast ) break; + // Next iteration + ipow += 2; + delta *= -xx * xx / ( ipow - 1 ) / ipow; + } + return sinx; + } + + // Mapping to [0,2*pi) range (long double signature) + constexpr long double mapIn0to2Pi( const long double xx ) + { + return xx - constexpr_floor( xx / 2 / 
constexpr_pi ) * 2 * constexpr_pi; + } + + // Constexpr implementation of cos (long double signature) + constexpr long double constexpr_cos_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_cos_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx ), 2 ) ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return sinTaylor( constexpr_pi_by_2 - xx ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return -sinTaylor( xx - constexpr_pi_by_2 ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return -constexpr_sqrt( 1 - constexpr_pow( sinTaylor( constexpr_pi - xx ), 2 ) ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return constexpr_cos_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_cos_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of cos (double signature, internally implemented as long double) + constexpr double constexpr_cos( const double x ) + { + return constexpr_cos_quad( x ); + } + + // Constexpr implementation of sin (long double signature) + constexpr long double constexpr_sin_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + else if( xx < constexpr_pi_by_4 ) // [0/4*pi, 1/4*pi) + return sinTaylor( xx ); + else if( xx < constexpr_pi_by_2 ) // [1/4*pi, 2/4*pi) + return constexpr_sqrt( 1 - 
constexpr_pow( sinTaylor( constexpr_pi_by_2 - xx ), 2 ) ); + else if( xx < 3 * constexpr_pi_by_4 ) // [2/4*pi, 3/4*pi) + return constexpr_sqrt( 1 - constexpr_pow( sinTaylor( xx - constexpr_pi_by_2 ), 2 ) ); + else if( xx < constexpr_pi ) // [3/4*pi, 4/4*pi) + return sinTaylor( constexpr_pi - xx ); + else if( xx < 2 * constexpr_pi ) // [4/4*pi, 8/4*pi) + return -constexpr_sin_quad( 2 * constexpr_pi - xx, true ); + else // [8/4*pi, +inf) + return constexpr_sin_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of sin (double signature, internally implemented as long double) + constexpr double constexpr_sin( const double x ) + { + return constexpr_sin_quad( x ); + } + + // Constexpr implementation of tan (long double signature) + constexpr long double constexpr_tan_quad( const long double xx, const bool assume0to2Pi = false ) + { + if( assume0to2Pi ) + { + assert( xx >= 0 && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + assert( xx < 2 * constexpr_pi && "The argument of constexpr_sin_quad is assumed to be in [0,2*pi)" ); + } + if( xx < 0 ) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + else if( xx < 2 * constexpr_pi ) // [0, 2*pi) + return constexpr_sin_quad( xx, assume0to2Pi ) / constexpr_cos_quad( xx, assume0to2Pi ); + else // [8/4*pi, +inf) + return constexpr_tan_quad( mapIn0to2Pi( xx ), true ); + } + + // Constexpr implementation of tan (double signature, internally implemented as long double) + constexpr double constexpr_tan( const double x ) + { + return constexpr_tan_quad( x ); + } + + // Constexpr implementation of atan for -1= -1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + assert( xx < 1 && "The argument of atanTaylor is assumed to be in (-1,+1)" ); + long double atanx = 0; + int ipow = 1; + long double xpow = xx; + while( true ) + { + long double atanxlast = atanx; + atanx += xpow / ipow; +#ifdef CONSTEXPR_MATH_DEBUG + std::cout << "ipow=" << ipow << ", xpow=" << xpow << ", atanx=" << 
atanx << std::endl; // for debugging (not constexpr) +#endif + if( atanx == atanxlast ) break; + // Next iteration + ipow += 2; + xpow *= -xx * xx; + } + return atanx; + } + + // Constexpr implementation of atan (long double signature) + constexpr long double constexpr_atan_quad( const long double xx ) + { + if( xx > 1 ) + return constexpr_pi_by_2 - atanTaylor( 1 / xx ); + else if( xx == 1 ) + return constexpr_pi_by_4; + else if( xx > -1 ) + return atanTaylor( xx ); + else if( xx == -1 ) + return -constexpr_pi_by_4; + else // if( xx < -1 ) + return -constexpr_pi_by_2 - atanTaylor( 1 / xx ); + } + + // Constexpr implementation of atan (double signature, internally implemented as long double) + constexpr double constexpr_atan( const double x ) + { + return constexpr_atan_quad( x ); + } +} + +#endif // constexpr_math_h diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index 7ede1dbfae..9ef1c44899 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -76,7 +76,8 @@ namespace mgOnGpu /* clang-format off */ }; template - inline __host__ __device__ cxsmpl // (NB: cannot be constexpr as a constexpr function cannot have a nonliteral return type "mgOnGpu::cxsmpl") + constexpr // (NB: now valid code? in the past this failed as "a constexpr function cannot have a nonliteral return type mgOnGpu::cxsmpl") + inline __host__ __device__ cxsmpl conj( const cxsmpl& c ) { return cxsmpl( c.real(), -c.imag() ); diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 63369c306b..d29d7ee235 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.116 s +1 processes with 3 diagrams generated in 0.119 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.144 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.363s -user 0m1.211s -sys 0m0.065s -Code generation completed in 1 seconds +real 0m1.325s +user 0m1.233s +sys 0m0.061s +Code generation completed in 2 seconds From 8c40ac40b0ca83205c41efcf6cd9a18ce8fea9a6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:37:06 +0100 Subject: [PATCH 62/96] [susy2] in ee_mumu.sa CPPProcess.cc, fix undefined bsmIndepParam by setting it as a nullptr, and for simplicity add the same code as in susy Now eemumu.sa builds and tests ok for both HRDCOD=0 and =1. 
--- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 556b827c55..d15cfdf8f4 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype cIPC[6]; #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) From 2f398a9bf986fe94d3104841250464535566d31c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:41:48 +0100 Subject: [PATCH 63/96] [susy2] in CODEGEN, fix the build of ee_mumu.sa by adding the undefined bsmIndepParam as a nullptr, and for simplicity add the same code as in susy Now both susy_gg_tt and sm ee_mumu build 
and test ok for hrdcod=0/1 --- .../template_files/cpp_model_parameters_h.inc | 7 ++++- .../gpu/process_function_definitions.inc | 27 ++++++++++++++++++- .../CUDACPP_SA_OUTPUT/model_handling.py | 22 +-------------- 3 files changed, 33 insertions(+), 23 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 972cebc051..b496a37f99 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -20,7 +20,12 @@ #include "constexpr_math.h" -//==========================================================================%(bsmdefine)s +//========================================================================== + +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +%(bsmdefine)s #ifndef MGONGPU_HARDCODE_PARAM%(eftwarn0)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 320d33cc45..a28f01916d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -51,7 +51,32 @@ namespace mg5amcCpu %(cipdstatic)s 
%(cipcstatic)s #endif -#endif%(bsmindepparam)s +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif +#endif // Helicity combinations (and filtering of "good" helicity combinations) #ifdef MGONGPUCPP_GPUIMPL diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 4925bcf0c2..36435207a2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -926,8 +926,8 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdcoup2'] = ' // (none)' replace_dict['dcoupoutdcoup2'] = '' # Require HRDCOD=1 in EFT and special handling in EFT for fptype=float using SIMD + replace_dict['bsmdefine'] = '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if len( bsmparam_indep_real_used ) > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' if self.model_name[:2] == 'sm' : - replace_dict['bsmdefine'] = '' replace_dict['bsmip0'] = '' replace_dict['bsmip1'] = 
'' replace_dict['eftwarn0'] = '' @@ -936,11 +936,6 @@ def super_generate_parameters_class_files(self): replace_dict['eftspecial1'] = ' // Begin SM implementation - no special handling of vectors of floats as in EFT (#439)' replace_dict['eftspecial2'] = ' // End SM implementation - no special handling of vectors of floats as in EFT (#439)' else: - replace_dict['bsmdefine'] = ''' - -// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 -// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated -%s''' % ( '#define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1' if len( bsmparam_indep_real_used ) > 0 else '#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0' ) replace_dict['bsmip0'] = ''' // BSM parameters that do not depend on alphaS but are needed in the computation of alphaS-dependent couplings; static constexpr int nBsmIndepParam = %s; @@ -1206,25 +1201,10 @@ def get_process_function_definitions(self, write=True): replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' if self.model_name[:2] == 'sm' : - replace_dict['bsmindepparam'] = '' replace_dict['bsmMemcpySym'] = '' replace_dict['bsmMemcpy'] = '' replace_dict['bsmdump'] = '' else: - replace_dict['bsmindepparam'] = '''\n - // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 - // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated -#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 -#ifdef MGONGPU_HARDCODE_PARAM - __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; -#else -#ifdef MGONGPUCPP_GPUIMPL - __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; -#else - static double 
bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; -#endif -#endif -#endif''' replace_dict['bsmMemcpySym'] = '\n if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 )\n gpuMemcpyToSymbol( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) );' replace_dict['bsmMemcpy'] = '\n if( Parameters_MSSM_SLHA2::nBsmIndepParam > 0 )\n memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) );' replace_dict['bsmdump'] = '\n //for ( int i=0; imdl_bsmIndepParam[i] = " << m_pars->mdl_bsmIndepParam[i] << std::endl;' From 319b13f196852ae34010e24239c34bc5229eea9b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 19:52:20 +0100 Subject: [PATCH 64/96] [susy2] regenerate ee_mumu.sa and susy_gg_tt.sa, they build and test ok now --- .../ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt | 12 ++++++------ epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 5 +++++ .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 12 ++++++------ .../P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc | 11 +++++++++++ .../susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 1 + 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 3e3ecb225d..1dd5b8aa45 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005336284637451172  +DEBUG: model prefixing takes 0.005441427230834961  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,14 +177,14 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. -Generated helas calls for 1 subprocesses (2 diagrams) in 0.003 s +Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.264 s +ALOHA: aloha creates 4 routines in 0.266 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.703s -user 0m0.574s -sys 0m0.064s +real 0m0.722s +user 0m0.605s +sys 0m0.053s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 586ffc9955..187875fa75 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index d29d7ee235..4b03e54ea6 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.119 s +1 processes with 3 diagrams generated in 0.116 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.136 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.325s -user 0m1.233s -sys 0m0.061s -Code generation completed in 2 seconds +real 0m1.286s +user 0m1.211s +sys 0m0.068s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc index bf63d1a80e..108c5a9f5f 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/P1_Sigma_MSSM_SLHA2_gg_ttx/CPPProcess.cc @@ -91,6 +91,7 @@ namespace mg5amcCpu // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) #ifdef 
MGONGPUCPP_NBSMINDEPPARAM_GT_0 #ifdef MGONGPU_HARDCODE_PARAM __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; @@ -101,6 +102,16 @@ namespace mg5amcCpu static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #endif #endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 8e3d187ddf..469b8633ee 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -26,6 +26,7 @@ // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) #define MGONGPUCPP_NBSMINDEPPARAM_GT_0 1 #ifndef MGONGPU_HARDCODE_PARAM From f778ae0eeaea7020e5dd482910862687f3e10d0e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 21:04:37 +0100 Subject: [PATCH 65/96] [susy2] regenerate all processes --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 ++-- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 25 +++++ .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 5 + .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 20 ++-- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 25 +++++ epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 5 + .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +-- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 25 
+++++ epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 5 + .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 28 +++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 25 +++++ .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 5 + .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 24 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 25 +++++ epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 5 + .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 +-- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 25 +++++ epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 5 + .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 22 ++--- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 25 +++++ .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 5 + .../CODEGEN_cudacpp_gg_ttgg_log.txt | 12 +-- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 25 +++++ epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 5 + .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 ++--- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 25 +++++ .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 5 + .../CODEGEN_cudacpp_gg_ttggg_log.txt | 12 +-- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 25 +++++ .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 5 + .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 24 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 25 +++++ .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 25 +++++ epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 5 + .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 14 +-- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 25 +++++ .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 25 +++++ epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 5 + .../CODEGEN_cudacpp_heft_gg_h_log.txt | 58 +---------- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 11 +++ .../heft_gg_h.sa/src/Parameters_heft.h | 1 + .../CODEGEN_mad_pp_tt012j_log.txt | 96 +++++++++---------- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 25 +++++ .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 25 +++++ .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 25 +++++ 
.../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 25 +++++ .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 25 +++++ .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 25 +++++ .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 25 +++++ .../P2_uxcx_ttxuxcx/CPPProcess.cc | 25 +++++ .../P2_uxux_ttxuxux/CPPProcess.cc | 25 +++++ .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 5 + 63 files changed, 1071 insertions(+), 217 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 0142beb14a..2016ca9239 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005445718765258789  +DEBUG: model prefixing takes 0.005757570266723633  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -176,8 +176,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.099 s +Wrote files for 8 helas calls in 0.102 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.213 s +ALOHA: aloha creates 3 routines in 0.207 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.276 s +ALOHA: aloha creates 7 routines in 0.262 s FFV1 FFV1 FFV2 @@ -250,9 +250,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.906s -user 0m1.670s -sys 0m0.218s +real 0m1.933s +user 0m1.699s +sys 0m0.217s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 6b66832a3d..57b343359f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype cIPC[6]; #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 586ffc9955..187875fa75 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ 
//========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 1dd5b8aa45..949e09216e 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005441427230834961  +DEBUG: model prefixing takes 0.005952119827270508  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.266 s +ALOHA: aloha creates 4 routines in 0.274 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.722s -user 0m0.605s -sys 0m0.053s +real 0m0.695s +user 0m0.602s +sys 0m0.067s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 78f6fa5a2b..11cf37750a 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005376338958740234  +DEBUG: model prefixing takes 0.0054476261138916016  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.102 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.781 s +ALOHA: aloha creates 2 routines in 0.151 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.215 s +ALOHA: aloha creates 4 routines in 0.135 s VVV1 FFV1 FFV1 @@ -239,10 +239,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.602s -user 0m1.487s -sys 0m0.213s -Code generation completed in 3 seconds +real 0m1.797s +user 0m1.502s +sys 0m0.224s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 3bda469e3e..db5b9e982f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index e1493c899c..2952dd5399 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ 
b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 8d61954ee9..a43219a222 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005337953567504883  +DEBUG: model prefixing takes 0.0053577423095703125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.549s -user 0m0.473s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m0.540s +user 0m0.481s +sys 0m0.056s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index 9931c27ce7..1940c4fa19 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index e1493c899c..2952dd5399 100644 --- 
a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 47a3a30985..e6ed8e10f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00570368766784668  +DEBUG: model prefixing takes 0.005404949188232422  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams add process g g > t t~ g INFO: Checking for minimal orders which gives processes. @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -187,8 +187,8 @@ INFO: Processing color information for process: g g > t t~ g @2 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -204,8 +204,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,14 +221,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.241 s +Wrote files for 46 helas calls in 0.238 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.326 s +ALOHA: aloha creates 5 routines in 0.320 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -285,10 +285,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.395s -user 0m2.049s -sys 0m0.237s -Code generation completed in 2 seconds +real 0m2.352s +user 0m2.041s +sys 0m0.212s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 3bda469e3e..db5b9e982f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index 4d3ef72615..61e9a4ce01 100644 --- 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this 
code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 9cc19e9825..9b864eb2aa 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005432605743408203  +DEBUG: model prefixing takes 0.0058307647705078125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.023 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s -Wrote files for 36 helas calls in 0.152 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s +Wrote files for 36 helas calls in 0.156 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.327 s +ALOHA: aloha creates 5 routines in 0.342 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.333 s VVV1 VVV1 FFV1 @@ -254,10 +254,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.223s -user 0m1.930s -sys 0m0.226s -Code generation completed in 2 seconds +real 0m2.664s +user 0m2.011s +sys 0m0.247s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 6611b68803..0a14b03d80 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ 
b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index c69338efb0..28c2032078 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005579233169555664  +DEBUG: model prefixing takes 0.005526304244995117  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.023 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.323 s +ALOHA: aloha creates 5 routines in 0.347 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.773s -user 0m0.714s -sys 0m0.051s +real 0m0.841s +user 0m0.757s +sys 0m0.056s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 081e714ee5..f39085c55e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -24,6 +24,11 @@ 
//========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 9294477de4..c3e8bfe3e2 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005306720733642578  +DEBUG: model prefixing takes 0.005539417266845703  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.159 s +1 processes with 123 diagrams generated in 0.166 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -177,8 +177,8 @@ INFO: Organizing processes into subprocess groups INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.425 s -Wrote files for 222 helas calls in 0.687 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s +Wrote files for 222 helas calls in 0.693 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.327 s +ALOHA: aloha creates 5 routines in 0.331 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.312 s VVV1 VVV1 FFV1 @@ -257,9 +257,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.366s -user 0m2.984s -sys 0m0.248s +real 0m3.928s +user 0m3.024s +sys 0m0.234s Code generation completed in 4 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index b7e40edf31..6d4edf8dfd 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -24,6 
+24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index f2e8e57a44..fc1bed2e26 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005572795867919922  +DEBUG: model prefixing takes 0.005308628082275391  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.315 s +ALOHA: aloha creates 5 routines in 0.331 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.522s -user 0m1.362s -sys 0m0.061s -Code generation completed in 2 seconds +real 0m4.175s +user 0m1.386s +sys 0m0.062s +Code generation completed in 5 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index e6eb707fa4..40d227774f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -24,6 
+24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 79e09690a5..d2a36d753e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005720376968383789  +DEBUG: model prefixing takes 0.005647897720336914  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.873 s +1 processes with 1240 diagrams generated in 1.864 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -179,8 +179,8 @@ INFO: Processing color information for process: g g > t t~ g g g @1 INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. 
Introduce 3030 contraction -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.567 s -Wrote files for 2281 helas calls in 18.534 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.504 s +Wrote files for 2281 helas calls in 18.405 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.314 s +ALOHA: aloha creates 5 routines in 0.321 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.308 s +ALOHA: aloha creates 10 routines in 0.325 s VVV1 VVV1 FFV1 @@ -259,9 +259,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m29.191s -user 0m28.652s -sys 0m0.431s +real 0m29.042s +user 0m28.517s +sys 0m0.396s Code generation completed in 29 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index ed2ea4ce28..78a4e3812f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ 
b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index b33a603bdb..417850ac12 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054645538330078125  +DEBUG: model prefixing takes 0.0054471492767333984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.881 s +1 processes with 1240 diagrams generated in 1.878 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.597 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
quit -real 0m13.328s -user 0m12.880s -sys 0m0.107s +real 0m13.327s +user 0m12.815s +sys 0m0.113s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index fd7d52e24c..82ecd769d2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -24,6 +24,11 @@ 
//========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 654a104b7a..834281d14a 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00540471076965332  +DEBUG: model prefixing takes 0.00538182258605957  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -200,8 +200,8 @@ INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,8 +217,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -233,17 +233,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s -Wrote files for 32 helas calls in 0.217 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s +Wrote files for 32 helas calls in 0.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.151 s +ALOHA: aloha creates 2 routines in 0.147 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -296,9 +296,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.937s -user 0m1.715s -sys 0m0.200s +real 0m1.914s +user 0m1.686s +sys 0m0.218s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 00c62e5820..ff4c1ec813 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 5006ce7768..2ecd2efcbb 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 00b35be0b5..4e8c6e6e56 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 
0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 3451aaba00..ee0ab514b9 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005368471145629883  +DEBUG: model prefixing takes 0.005761861801147461  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.081 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -210,12 +210,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.149 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.659s -user 0m0.579s -sys 0m0.068s +real 0m1.488s +user 0m0.607s +sys 0m0.049s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index e82385b24d..c8781251f2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else 
+ static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 0fad1abce3..d7da8c15f2 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git 
a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 00b35be0b5..4e8c6e6e56 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 5af82fa96a..212a7d3db3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -62,54 +62,6 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft -INFO: download model from http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz to the following directory: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/models  ---2024-02-29 19:27:58-- http://madgraph.phys.ucl.ac.be/Downloads/models/heft.tgz -Resolving madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)... 130.104.1.243 -Connecting to madgraph.phys.ucl.ac.be (madgraph.phys.ucl.ac.be)|130.104.1.243|:80... connected. -HTTP request sent, awaiting response... 200 OK -Length: 50876 (50K) [application/x-gzip] -Saving to: ‘tmp.tgz’ - - 0K .......... .......... .......... .......... ......... 
100% 1.03M=0.05s - -2024-02-29 19:27:58 (1.03 MB/s) - ‘tmp.tgz’ saved [50876/50876] - -heft/ -heft/write_param_card.py -heft/restrict_ckm.dat -heft/couplings.py -heft/HEFT_UFO.log -heft/lorentz.py -heft/__init__.py -heft/__pycache__/ -heft/particles.py -heft/object_library.py -heft/restrict_default.dat -heft/restrict_zeromass_ckm.dat -heft/restrict_no_b_mass.dat -heft/function_library.py -heft/parameters.py -heft/py3_model.pkl -heft/coupling_orders.py -heft/restrict_no_tau_mass.dat -heft/vertices.py -heft/restrict_no_masses.dat -heft/__pycache__/write_param_card.cpython-311.pyc -heft/__pycache__/parameters.cpython-311.pyc -heft/__pycache__/function_library.cpython-311.pyc -heft/__pycache__/coupling_orders.cpython-311.pyc -heft/__pycache__/object_library.cpython-311.pyc -heft/__pycache__/couplings.cpython-311.pyc -heft/__pycache__/particles.cpython-311.pyc -heft/__pycache__/vertices.cpython-311.pyc -heft/__pycache__/lorentz.cpython-311.pyc -heft/__pycache__/__init__.cpython-311.pyc -INFO: reload from .py file -INFO: load particles -INFO: load vertices -WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  -DEBUG: model prefixing takes 0.006208658218383789  INFO: Restrict model heft with file models/heft/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -204,7 +156,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.071 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -216,7 +168,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m1.586s -user 0m0.442s -sys 0m0.073s -Code generation completed in 1 seconds +real 0m0.413s +user 0m0.363s +sys 0m0.047s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 319ca3e12c..56063a2b34 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -91,6 +91,7 @@ namespace mg5amcCpu // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) #ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 #ifdef MGONGPU_HARDCODE_PARAM 
__device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; @@ -101,6 +102,16 @@ namespace mg5amcCpu static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; #endif #endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 9ee9c417f2..c4ac3a4bcc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -26,6 +26,7 @@ // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) #undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 #ifndef MGONGPU_HARDCODE_PARAM diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index e2da496917..01a922244d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00541996955871582  +DEBUG: model prefixing takes 0.0053021907806396484  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.137 s +13 processes with 76 diagrams generated in 0.135 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.831 s +65 processes with 1119 diagrams generated in 1.808 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -499,8 +499,8 @@ INFO: Combined process c c~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -516,8 +516,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -533,8 +533,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -550,8 +550,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -567,8 +567,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -584,8 +584,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -601,8 +601,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -618,8 +618,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -635,8 +635,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -652,8 +652,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -669,8 +669,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -686,8 +686,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -703,8 +703,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -720,8 +720,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -737,8 +737,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -754,8 +754,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -771,8 +771,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -788,8 +788,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx -DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1118]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.288 s -Wrote files for 810 helas calls in 3.872 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.268 s +Wrote files for 810 helas calls in 3.224 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.334 s +ALOHA: aloha creates 5 routines in 0.347 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.318 s +ALOHA: aloha creates 10 routines in 0.316 s VVV1 VVV1 FFV1 @@ -1030,10 +1030,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m9.475s -user 0m8.365s -sys 0m0.419s -Code generation completed in 10 seconds +real 0m8.737s +user 0m8.290s +sys 0m0.416s +Code generation completed in 9 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index f2e34f3cc4..23fd9fb7df 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 87e2241a5e..13bb9e58c1 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 6611b68803..0a14b03d80 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when 
generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 61576da03d..1a93011512 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double 
bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index ec8a171811..91b6d21fc2 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of 
"good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 1df0359166..9076b52d2e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 63dc41ba9c..b18547af14 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -87,6 +87,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = 
nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index b0462c7741..a3f295ac2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef 
MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 44603af8fa..332d428750 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; 
+#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 76494fea01..cd479188d7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index 9e0123b38e..3db334100f 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -92,6 +92,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index 3adb4e237d..95c9e8cec1 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -98,6 +98,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is 
determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index b3a3804c9d..6997872f83 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double 
bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index 4dc65ee56c..3f28f127f6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -98,6 +98,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of 
"good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 343d0f7ef7..e848eb2876 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index b2b189eeea..e8eba39ab3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* 
cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 9321d65fe4..1ae0519ec0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -92,6 +92,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) 
+#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 073e575c03..f662a7cd7b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -90,6 +90,31 @@ namespace mg5amcCpu static fptype cIPD[2]; static fptype* cIPC = nullptr; // unused as nicoup=0 #endif +#endif + + // AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 + // The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated + // For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#ifdef MGONGPUCPP_NBSMINDEPPARAM_GT_0 +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* bsmIndepParam = Parameters_MSSM_SLHA2::mdl_bsmIndepParam; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#else + static double bsmIndepParam[Parameters_MSSM_SLHA2::nBsmIndepParam]; +#endif +#endif +#else +#ifdef MGONGPU_HARDCODE_PARAM + __device__ const double* 
bsmIndepParam = nullptr; +#else +#ifdef MGONGPUCPP_GPUIMPL + __device__ __constant__ double* bsmIndepParam = nullptr; +#else + static double* bsmIndepParam = nullptr; +#endif +#endif #endif // Helicity combinations (and filtering of "good" helicity combinations) diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 5660a06a4d..332b14bc7b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -24,6 +24,11 @@ //========================================================================== +// AV Jan 2024 (PR #625): this ugly #define was the only way I found to avoid creating arrays[nBsm] in CPPProcess.cc if nBsm is 0 +// The problem is that nBsm is determined when generating Parameters.h, which happens after CPPProcess.cc has already been generated +// For simplicity, keep this code hardcoded also for SM processes (a nullptr is needed as in the case nBsm == 0) +#undef MGONGPUCPP_NBSMINDEPPARAM_GT_0 + #ifndef MGONGPU_HARDCODE_PARAM #include "read_slha.h" From 022e2f6202c56c240d05dcbc9cc318da8e0a45c0 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 21:40:18 +0100 Subject: [PATCH 66/96] [susy2] in susy_gg_tt.sa, fix FPTYPE=f builds --- epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h index 469b8633ee..4dc38e5ce1 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/Parameters_MSSM_SLHA2.h @@ -805,12 +805,12 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_MSSM_SLHA2; #else - const fptype mdl_I51x11 = bsmIndepParamPtr[0]; + const double mdl_I51x11 = bsmIndepParamPtr[0]; #endif // NB: hardcode cxtype cI(0,1) instead of cxtype (or hardcoded cxsmpl) mdl_complexi (which exists in Parameters_MSSM_SLHA2) because: // (1) mdl_complexi is always (0,1); (2) mdl_complexi is undefined in device code; (3) need cxsmpl conversion to cxtype in code below @@ -882,7 +882,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_MSSM_SLHA2_dependentCouplings; From dcb55e4be18218efff66be0332bb78e97f869595 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 21:43:20 +0100 Subject: [PATCH 67/96] [susy2] in CODEGEN, backport from susy_gg_tt.sa the fix for FPTYPE=f builds --- .../madgraph/iolibs/template_files/cpp_model_parameters_h.inc | 4 ++-- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index b496a37f99..25c7c2fb82 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -153,7 +153,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma 
nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_%(model_name)s; @@ -200,7 +200,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_%(model_name)s_dependentCouplings; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 36435207a2..fea841ff14 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -949,7 +949,7 @@ def super_generate_parameters_class_files(self): if len( bsmparam_indep_real_used ) == 0: replace_dict['eftspecial0'] = ' // No additional parameters needed in constant memory for this BSM model' else: - replace_dict['eftspecial0'] = '\n'.join( ' const fptype %s = bsmIndepParamPtr[%i];' % ( par, ipar ) for ipar, par in enumerate( bsmparam_indep_real_used ) ) + replace_dict['eftspecial0'] = '\n'.join( ' const double %s = bsmIndepParamPtr[%i];' % ( par, ipar ) for ipar, par in enumerate( bsmparam_indep_real_used ) ) replace_dict['eftspecial1'] = ' // Begin non-SM (e.g. 
EFT) implementation - special handling of vectors of floats (#439)' replace_dict['eftspecial1'] += '\n#if not( defined MGONGPU_CPPSIMD && defined MGONGPU_FPTYPE_FLOAT )' replace_dict['eftspecial2'] = """#else From 81476cf5ce6a262f80c16fedf9bd7e0ce128df50 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 21:43:49 +0100 Subject: [PATCH 68/96] [susy2] regenerate susy_gg_tt.sa, all ok --- .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index 4b03e54ea6..b232f3a2d2 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.116 s +1 processes with 3 diagrams generated in 0.115 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.136 s +ALOHA: aloha creates 2 routines in 0.135 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.286s -user 0m1.211s -sys 0m0.068s -Code generation completed in 1 seconds +real 0m1.268s +user 0m1.196s +sys 0m0.064s +Code generation completed in 2 seconds From d5a293634120e1223951e49c9f13442f7062f845 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 21:47:53 +0100 Subject: [PATCH 69/96] [susy2] regenerate all processes after fixing FPTYPE=f builds --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 14 ++--- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 4 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++-- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 4 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++--- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 4 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 4 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 18 +++--- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 4 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 20 +++---- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 4 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 16 ++--- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 4 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 18 +++--- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 4 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 16 ++--- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 4 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 +++---- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 4 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++--- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 4 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 20 +++---- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 4 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 12 ++-- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 4 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 8 +-- .../heft_gg_h.sa/src/Parameters_heft.h | 4 +- .../CODEGEN_mad_pp_tt012j_log.txt | 60 +++++++++---------- .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 4 +- 30 files changed, 167 
insertions(+), 167 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 2016ca9239..436ccf2787 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757570266723633  +DEBUG: model prefixing takes 0.005387067794799805  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -177,7 +177,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,12 +194,12 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.099 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.207 s +ALOHA: aloha creates 3 routines in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines @@ -250,9 +250,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.933s -user 0m1.699s -sys 0m0.217s +real 0m2.196s +user 0m1.648s +sys 0m0.226s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 187875fa75..05b04932c0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -216,7 +216,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -265,7 +265,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 949e09216e..e193525ae5 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005952119827270508  +DEBUG: model prefixing takes 0.0053081512451171875  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -184,7 +184,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.274 s +ALOHA: aloha creates 4 routines in 0.266 s FFV1 FFV1 FFV2 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
quit -real 0m0.695s -user 0m0.602s -sys 0m0.067s +real 0m0.653s +user 0m0.593s +sys 0m0.050s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 187875fa75..05b04932c0 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -216,7 +216,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -265,7 +265,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 11cf37750a..c85bdd089c 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054476261138916016  +DEBUG: model prefixing takes 0.005442142486572266  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -194,16 +194,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.151 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -239,9 +239,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.797s -user 0m1.502s -sys 0m0.224s +real 0m1.828s +user 0m1.489s +sys 0m0.214s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 2952dd5399..7ec865da18 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -267,7 +267,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index a43219a222..fa994d07bb 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053577423095703125  +DEBUG: model prefixing takes 0.005362987518310547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.540s -user 0m0.481s -sys 0m0.056s -Code generation completed in 1 seconds +real 0m0.591s +user 0m0.466s +sys 0m0.061s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 2952dd5399..7ec865da18 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -267,7 +267,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index e6ed8e10f8..5d7837d53d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005404949188232422  +DEBUG: model prefixing takes 0.005667924880981445  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -188,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -205,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -221,14 +221,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.238 s +Wrote files for 46 helas calls in 0.241 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.320 s +ALOHA: aloha creates 5 routines in 0.327 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -236,7 +236,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.313 s VVV1 VVV1 FFV1 @@ -285,9 +285,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.352s -user 0m2.041s -sys 0m0.212s +real 0m2.412s +user 0m2.052s +sys 0m0.228s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 9b864eb2aa..0e2ccde1b8 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058307647705078125  +DEBUG: model prefixing takes 0.005261898040771484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s -Wrote files for 36 helas calls in 0.156 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s +Wrote files for 36 helas calls in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.326 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ 
-209,7 +209,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.333 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -254,9 +254,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.664s -user 0m2.011s -sys 0m0.247s +real 0m2.172s +user 0m1.939s +sys 0m0.225s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 28c2032078..687e5776a8 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005526304244995117  +DEBUG: model prefixing takes 0.005387306213378906  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.841s -user 0m0.757s -sys 0m0.056s -Code generation completed in 0 seconds +real 0m0.834s +user 0m0.714s +sys 0m0.055s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index c3e8bfe3e2..fd2622a44a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005539417266845703  +DEBUG: model prefixing takes 0.005427122116088867  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -193,15 +193,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.422 s -Wrote files for 222 helas calls in 0.693 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.452 s +Wrote files for 222 helas calls in 0.734 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.359 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -257,9 +257,9 @@ Type "launch" to generate events from 
this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.928s -user 0m3.024s -sys 0m0.234s +real 0m4.344s +user 0m3.114s +sys 0m0.252s Code generation completed in 4 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index fc1bed2e26..1d54f84638 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005308628082275391  +DEBUG: model prefixing takes 0.005550861358642578  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.159 s +1 processes with 123 diagrams generated in 0.158 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.420 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.424 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.318 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m4.175s -user 0m1.386s -sys 0m0.062s -Code generation completed in 5 seconds +real 0m1.469s +user 0m1.381s +sys 0m0.050s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index d2a36d753e..a6f2209ab5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005647897720336914  +DEBUG: model prefixing takes 0.005267620086669922  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.864 s +1 processes with 1240 diagrams generated in 1.873 s Total: 1 processes with 1240 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,15 +195,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.504 s -Wrote files for 2281 helas calls in 18.405 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.521 s +Wrote files for 2281 helas calls in 18.398 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.321 s +ALOHA: aloha creates 5 routines in 0.319 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -211,7 +211,7 @@ ALOHA: aloha creates FFV1 routines 
ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.325 s +ALOHA: aloha creates 10 routines in 0.315 s VVV1 VVV1 FFV1 @@ -259,9 +259,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.042s -user 0m28.517s -sys 0m0.396s +real 0m29.077s +user 0m28.548s +sys 0m0.372s Code generation completed in 29 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 417850ac12..df4e64b156 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: 
model prefixing takes 0.0054471492767333984  +DEBUG: model prefixing takes 0.0057828426361083984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.878 s +1 processes with 1240 diagrams generated in 1.873 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -178,7 +178,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/ FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.598 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -186,7 +186,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 0.347 s VVV1 VVV1 FFV1 @@ -209,7 +209,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.327s -user 0m12.815s -sys 0m0.113s +real 0m13.077s +user 0m12.915s +sys 0m0.098s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 834281d14a..44c049bb34 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00538182258605957  +DEBUG: model prefixing takes 0.005309581756591797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.077 s +8 processes with 40 diagrams generated in 0.078 s Total: 8 processes with 40 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -201,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -218,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -233,17 +233,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s Wrote files for 32 helas calls in 0.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.147 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.131 s FFV1 FFV1 FFV1 @@ -296,9 +296,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.914s -user 0m1.686s -sys 0m0.218s +real 0m1.993s +user 0m1.682s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index 4e8c6e6e56..a5f984c7db 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -267,7 +267,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index ee0ab514b9..d03914f912 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005761861801147461  +DEBUG: model prefixing takes 0.005572319030761719  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.081 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -215,7 +215,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.149 s +ALOHA: aloha creates 2 routines in 0.144 s FFV1 FFV1 FFV1 @@ -231,7 +231,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m1.488s -user 0m0.607s -sys 0m0.049s +real 0m0.815s +user 0m0.589s +sys 0m0.053s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index 4e8c6e6e56..a5f984c7db 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -217,7 +217,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -267,7 +267,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 212a7d3db3..629579c0aa 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -168,7 +168,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.413s -user 0m0.363s -sys 0m0.047s -Code generation completed in 0 seconds +real 0m0.434s +user 0m0.362s +sys 0m0.050s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index c4ac3a4bcc..6a00e34987 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_heft; @@ -310,7 +310,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_heft_dependentCouplings; diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index 01a922244d..3a9a6e1a7e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053021907806396484  +DEBUG: model prefixing takes 0.005480527877807617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.029 s +5 processes with 7 diagrams generated in 0.030 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. 
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.135 s +13 processes with 76 diagrams generated in 0.138 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.808 s +65 processes with 1119 diagrams generated in 1.816 s Total: 83 processes with 1202 diagrams output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 Load PLUGIN.CUDACPP_OUTPUT @@ -500,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -517,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -534,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -551,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -568,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -585,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -602,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -619,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -636,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -653,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -670,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -687,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -704,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -721,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -738,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -755,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -772,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -789,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1113]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6261]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -804,15 +804,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1871]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.268 s -Wrote files for 810 helas calls in 3.224 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.307 s +Wrote files for 810 helas calls in 3.230 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.338 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 204]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -820,7 +820,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.318 s VVV1 VVV1 FFV1 @@ -1030,10 +1030,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this 
process. quit -real 0m8.737s -user 0m8.290s -sys 0m0.416s -Code generation completed in 9 seconds +real 0m10.119s +user 0m8.317s +sys 0m0.432s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index 332b14bc7b..95e689c562 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -220,7 +220,7 @@ namespace mg5amcCpu #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif - __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const fptype* bsmIndepParamPtr ) + __host__ __device__ inline const DependentCouplings_sv computeDependentCouplings_fromG( const fptype_sv& G_sv, const double* bsmIndepParamPtr ) { #ifdef MGONGPU_HARDCODE_PARAM using namespace Parameters_sm; @@ -271,7 +271,7 @@ namespace mg5amcCpu __device__ inline void G2COUP( const fptype gs[], fptype couplings[], - const fptype* bsmIndepParamPtr ) + const double* bsmIndepParamPtr ) { mgDebug( 0, __FUNCTION__ ); using namespace Parameters_sm_dependentCouplings; From 7e7c99a22ec2ea601a276b67f185fd7c346b7b07 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:27:16 +0100 Subject: [PATCH 70/96] [susy2] in susy_gg_tt.sa, use literals for pi constants as M_PIl is not defined on Mac --- .../CODEGEN_cudacpp_susy_gg_tt_log.txt | 6 +++--- .../cudacpp/susy_gg_tt.sa/src/constexpr_math.h | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index b232f3a2d2..db46a05608 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ 
-554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.115 s +1 processes with 3 diagrams generated in 0.117 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.268s -user 0m1.196s +real 0m1.484s +user 0m1.210s sys 0m0.064s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h index 78ff8b16ab..ff5649fac7 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h +++ b/epochX/cudacpp/susy_gg_tt.sa/src/constexpr_math.h @@ -57,10 +57,19 @@ namespace mg5amcCpu return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); } - // PI from cmath - constexpr long double constexpr_pi = M_PIl; // pi - constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 - constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + // PI constants + // NB1: M_PIl from cmath is not defined on Mac + // NB2: std::numbers::pi needs c++20 but we are still using c++17 + // NB3: I could use my constexpr_atan(1)*4... but a literal is better?
+ //constexpr long double constexpr_pi = M_PIl; // pi + //constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + //constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + constexpr long double constexpr_pi = 3.141592653589793238462643383279502884L; // same as M_PIl in gcc + constexpr long double constexpr_pi_by_2 = 1.570796326794896619231321691639751442L; // same as M_PI_2l in gcc + constexpr long double constexpr_pi_by_4 = 0.785398163397448309615660845819875721L; // same as M_PI_4l in gcc + static_assert( constexpr_pi_by_4 * 4 == constexpr_pi ); + static_assert( constexpr_pi_by_4 * 2 == constexpr_pi_by_2 ); + static_assert( constexpr_pi_by_2 * 2 == constexpr_pi ); // Constexpr implementation of sin for 0 Date: Thu, 29 Feb 2024 22:28:28 +0100 Subject: [PATCH 71/96] [susy2] in CODEGEN backporting from susy_gg_tt.sa, use literals for pi constants as M_PIl is not defined on Mac --- .../iolibs/template_files/gpu/constexpr_math.h | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h index 78ff8b16ab..ff5649fac7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/constexpr_math.h @@ -57,10 +57,19 @@ namespace mg5amcCpu return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); } - // PI from cmath - constexpr long double constexpr_pi = M_PIl; // pi - constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 - constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + // PI constants + // NB1: M_PIl from cmath is not defined on Mac + // NB2: std::numbers::pi needs c++20 but we are still using c++17 + // NB3: I could use my constexpr_atan(1)*4... but a literal is better?
+ //constexpr long double constexpr_pi = M_PIl; // pi + //constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + //constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + constexpr long double constexpr_pi = 3.141592653589793238462643383279502884L; // same as M_PIl in gcc + constexpr long double constexpr_pi_by_2 = 1.570796326794896619231321691639751442L; // same as M_PI_2l in gcc + constexpr long double constexpr_pi_by_4 = 0.785398163397448309615660845819875721L; // same as M_PI_4l in gcc + static_assert( constexpr_pi_by_4 * 4 == constexpr_pi ); + static_assert( constexpr_pi_by_4 * 2 == constexpr_pi_by_2 ); + static_assert( constexpr_pi_by_2 * 2 == constexpr_pi ); // Constexpr implementation of sin for 0 Date: Thu, 29 Feb 2024 22:29:32 +0100 Subject: [PATCH 72/96] [susy2] regenerate susy_gg_tt.sa, all ok --- .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index db46a05608..e0884e66d0 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.117 s +1 processes with 3 diagrams generated in 0.116 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.135 s +ALOHA: aloha creates 2 routines in 0.137 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. quit -real 0m1.484s -user 0m1.210s -sys 0m0.064s -Code generation completed in 2 seconds +real 0m1.275s +user 0m1.201s +sys 0m0.061s +Code generation completed in 1 seconds From 1c1d9a9de242a22b15068edf1092cdd1dd07429a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:31:05 +0100 Subject: [PATCH 73/96] [susy2] manually copy the new constexpr_math.h with the fix for Mac to all processes for f in $(git ls-tree --name-only HEAD */src/constexpr_math.h); do \cp susy_gg_tt.sa/src/constexpr_math.h $f; done --- epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/ee_mumu.sa/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_tt.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_tt.sa/src/constexpr_math.h | 17 +++++++++++++---- .../cudacpp/gg_tt01g.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_ttg.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_ttg.sa/src/constexpr_math.h | 17 +++++++++++++---- 
epochX/cudacpp/gg_ttgg.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_ttgg.sa/src/constexpr_math.h | 17 +++++++++++++---- .../cudacpp/gg_ttggg.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gg_ttggg.sa/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gq_ttq.mad/src/constexpr_math.h | 17 +++++++++++++---- epochX/cudacpp/gq_ttq.sa/src/constexpr_math.h | 17 +++++++++++++---- .../cudacpp/heft_gg_h.sa/src/constexpr_math.h | 17 +++++++++++++---- .../cudacpp/pp_tt012j.mad/src/constexpr_math.h | 17 +++++++++++++---- 15 files changed, 195 insertions(+), 60 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h b/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h index 78ff8b16ab..ff5649fac7 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h +++ b/epochX/cudacpp/ee_mumu.mad/src/constexpr_math.h @@ -57,10 +57,19 @@ namespace mg5amcCpu return iexp == 0 ? 1 : base * constexpr_pow( base, iexp - 1 ); } - // PI from cmath - constexpr long double constexpr_pi = M_PIl; // pi - constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 - constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + // PI constants + // NB1: M_PIl from cmath is not defined on Mac + // NB2: std::numbers::pi needs c++20 but we are still using c++17 + // NB3: I could use my constexpr_atan(1)*4... but a literal is better?
+ //constexpr long double constexpr_pi = M_PIl; // pi + //constexpr long double constexpr_pi_by_2 = M_PI_2l; // pi/2 + //constexpr long double constexpr_pi_by_4 = M_PI_4l; // pi/4 + constexpr long double constexpr_pi = 3.141592653589793238462643383279502884L; // same as M_PIl in gcc + constexpr long double constexpr_pi_by_2 = 1.570796326794896619231321691639751442L; // same as M_PI_2l in gcc + constexpr long double constexpr_pi_by_4 = 0.785398163397448309615660845819875721L; // same as M_PI_4l in gcc + static_assert( constexpr_pi_by_4 * 4 == constexpr_pi ); + static_assert( constexpr_pi_by_4 * 2 == constexpr_pi_by_2 ); + static_assert( constexpr_pi_by_2 * 2 == constexpr_pi ); // Constexpr implementation of sin for 0 Date: Thu, 29 Feb 2024 22:36:08 +0100 Subject: [PATCH 74/96] [susy2] in susy_gg_tt.sa, use literals for pi constants also in testmisc.cc, as M_PIl is not defined on Mac (previously fixed constexpr_math.h) --- epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) From 3ef6e5669b03caa88844908a633b72492f2377ff Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:37:10 +0100 Subject: [PATCH 75/96] [susy2] in CODEGEN backporting susy_gg_tt.sa, use literals for pi constants also in testmisc.cc, as M_PIl is not defined on Mac (previously fixed constexpr_math.h) --- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) From 4b48614e774e26002c244157f8d66c68b182aad9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:37:39 +0100 Subject: [PATCH 76/96] [susy2] regenerate susy_gg_tt.sa, all ok --- .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index e0884e66d0..ed01c9bf02 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.137 s +ALOHA: aloha creates 2 routines in 0.134 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.275s -user 0m1.201s -sys 0m0.061s +real 0m1.285s +user 0m1.186s +sys 0m0.071s Code generation completed in 1 seconds From aee94c364b3ab64a1ad352f1a14ee804eb4f1fb6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:38:47 +0100 Subject: [PATCH 77/96] [susy2] manually fix testmisc.cc in all processes for f in $(git ls-tree --name-only HEAD */SubProcesses/testmisc.cc); do \cp susy_gg_tt.sa/SubProcesses/testmisc.cc $f; done --- epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc | 2 +- epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long 
double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? 
(note x!=xx) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 8c29482e5a..40aa01cc96 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -340,7 +340,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) << "x=" << x << ", x(0to2Pi)=" << mapIn0to2Pi( x ) << ", istep=" << istep; std::cout << std::setprecision( 6 ); // default }; - testSinCosTanX( M_PIl, 1E-3, true ); // from math.h + testSinCosTanX( constexpr_pi, 1E-3, true ); // from math.h testSinCosTanX( (long double)3.141592653589793238462643383279502884L, 1E-3, true ); // from math.h testSinCosTanX( 4.712388980384687897640105802565813064575L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x==xx) testSinCosTanX( 3 * constexpr_pi_by_2 - 1.96e-15L, 1E-3, true ); // from 100 steps n [-4*pi,6*pi]... succeeds? (note x!=xx) From 26ee2fed04232dd6305b1f4507b2b16eeae22447 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:48:21 +0100 Subject: [PATCH 78/96] [susy2] in susy_gg_tt.sa testmisc.cc, increase tolerances for Mac --- epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/susy_gg_tt.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) From 9c1623e13a3f288e9794c7c1a8d0b724e810b83e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:49:37 +0100 Subject: [PATCH 79/96] [susy2] in CODEGEN backporting susy_gg_tt.sa testmisc.cc, increase tolerances for Mac --- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) From ff7c9f6904d70dc853e87e612b0553db88662076 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:50:05 +0100 Subject: [PATCH 80/96] [susy2] regenerate susy_gg_tt.sa, all ok --- .../susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt index ed01c9bf02..3d9990af5c 100644 --- a/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt +++ b/epochX/cudacpp/susy_gg_tt.sa/CODEGEN_cudacpp_susy_gg_tt_log.txt @@ -554,7 +554,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.116 s +1 processes with 3 diagrams generated in 0.117 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_susy_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -582,7 +582,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.134 s +ALOHA: aloha creates 2 routines in 0.138 s VVV1 FFV1 FFV1 @@ -597,7 +597,7 @@ INFO: Created files Parameters_MSSM_SLHA2.h and Parameters_MSSM_SLHA2.cc in dire INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_susy_gg_tt/src/. 
quit -real 0m1.285s -user 0m1.186s -sys 0m0.071s -Code generation completed in 1 seconds +real 0m1.274s +user 0m1.200s +sys 0m0.067s +Code generation completed in 2 seconds From e7f0969dca1d19457fc581c3a4d6a33ac04fcfad Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 29 Feb 2024 22:50:23 +0100 Subject: [PATCH 81/96] [susy2] manually fix testmisc.cc in all processes again for f in $(git ls-tree --name-only HEAD */SubProcesses/testmisc.cc); do \cp susy_gg_tt.sa/SubProcesses/testmisc.cc $f; done --- epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc | 4 ++-- epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc | 4 ++-- 15 files changed, 30 insertions(+), 30 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); 
// fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? 
always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? 
+ testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 40aa01cc96..8cbb7b25a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -371,8 +371,8 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) testSinCosTan8( 1E-12, 1E-06 ); // fails with 1E-07 testSinCosTan8( 1E-09, 1E-09 ); // fails with 1E-10 testSinCosTan8( 1E-06, 1E-12 ); // fails with 1E-13 - testSinCosTan8( 1E-03, 1E-15 ); // fails with 1E-16 - testSinCosTan8( 1E-02, 1E-99 ); // never fails? always bit-by-bit identical? + testSinCosTan8( 1E-03, 1E-14 ); // fails with 1E-16: could use 1E-14 but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) + testSinCosTan8( 1E-02, 1E-14 ); // never fails? could use 1E-99(?) 
but keep it at 1E-14 (avoid 'EXPECT_NEAR equivalent to EXPECT_EQUAL' on Mac) // Test constexpr sin, cos, tan - N points almost randomly with a varying tolerance auto testSinCosTanN = [testSinCosTanX, distance4]( const int nstep, const double x0, const double x1 ) From 9601eeaea2944f156a808400e5d8de30d25a84b7 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 1 Mar 2024 08:17:50 +0100 Subject: [PATCH 82/96] [susy2] first, go back to rocrand logs on itscrd90 for easier comparisons git checkout 04e31d7955f2afb7a9ca373f840eeb17487d4ecc tmad/logs_* git checkout c27cb7e5a9ccb00da24fd66d80b16efad01fe6ab tput/logs_* --- .../log_eemumu_mad_d_inl0_hrd0.txt | 402 +++++++++++----- .../log_eemumu_mad_f_inl0_hrd0.txt | 422 ++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 402 +++++++++++----- .../log_ggtt_mad_d_inl0_hrd0.txt | 414 +++++++++++----- .../log_ggtt_mad_f_inl0_hrd0.txt | 418 ++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 414 +++++++++++----- .../log_ggttg_mad_d_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 434 +++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 420 ++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 418 ++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 447 ++++++++++++----- .../log_gqttq_mad_f_inl0_hrd0.txt | 449 ++++++++++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 453 +++++++++++++----- .../log_eemumu_mad_d_inl0_hrd0.txt | 232 +++++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 240 ++++++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 215 +++++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 235 +++++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 232 +++++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 230 +++++---- 
.../log_eemumu_mad_d_inl1_hrd1.txt | 230 +++++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 246 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 252 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 229 +++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 247 ++++++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 246 ++++++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 244 ++++++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 244 ++++++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 232 +++++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 232 +++++---- .../log_ggtt_mad_d_inl0_hrd0.txt | 232 +++++---- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 240 ++++++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 215 +++++---- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 235 +++++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 232 +++++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 230 +++++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 230 +++++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 250 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 258 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 241 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 253 ++++++---- .../log_ggtt_mad_f_inl0_hrd1.txt | 250 ++++++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 244 ++++++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 244 ++++++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 230 +++++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 230 +++++---- .../log_ggttg_mad_d_inl0_hrd0.txt | 255 +++++----- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 265 +++++----- .../log_ggttg_mad_d_inl0_hrd1.txt | 255 +++++----- .../log_ggttg_mad_f_inl0_hrd0.txt | 269 ++++++----- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 279 ++++++----- .../log_ggttg_mad_f_inl0_hrd1.txt | 269 ++++++----- .../log_ggttg_mad_m_inl0_hrd0.txt | 255 +++++----- .../log_ggttg_mad_m_inl0_hrd1.txt | 255 +++++----- .../log_ggttgg_mad_d_inl0_hrd0.txt | 255 +++++----- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 265 +++++----- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 236 +++++---- 
.../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 260 +++++----- .../log_ggttgg_mad_d_inl0_hrd1.txt | 255 +++++----- .../log_ggttgg_mad_d_inl1_hrd0.txt | 257 +++++----- .../log_ggttgg_mad_d_inl1_hrd1.txt | 257 +++++----- .../log_ggttgg_mad_f_inl0_hrd0.txt | 271 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 281 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 260 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 276 ++++++----- .../log_ggttgg_mad_f_inl0_hrd1.txt | 271 ++++++----- .../log_ggttgg_mad_f_inl1_hrd0.txt | 275 ++++++----- .../log_ggttgg_mad_f_inl1_hrd1.txt | 275 ++++++----- .../log_ggttgg_mad_m_inl0_hrd0.txt | 251 ++++++---- .../log_ggttgg_mad_m_inl0_hrd1.txt | 251 ++++++---- .../log_ggttggg_mad_d_inl0_hrd0.txt | 255 +++++----- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 265 +++++----- .../log_ggttggg_mad_d_inl0_hrd1.txt | 255 +++++----- .../log_ggttggg_mad_f_inl0_hrd0.txt | 271 ++++++----- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 281 ++++++----- .../log_ggttggg_mad_f_inl0_hrd1.txt | 271 ++++++----- .../log_ggttggg_mad_m_inl0_hrd0.txt | 255 +++++----- .../log_ggttggg_mad_m_inl0_hrd1.txt | 255 +++++----- .../log_gqttq_mad_d_inl0_hrd0.txt | 256 +++++++--- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 268 ++++++++--- .../log_gqttq_mad_d_inl0_hrd1.txt | 256 +++++++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 256 +++++++--- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 268 ++++++++--- .../log_gqttq_mad_f_inl0_hrd1.txt | 256 +++++++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 256 +++++++--- .../log_gqttq_mad_m_inl0_hrd1.txt | 256 +++++++--- 90 files changed, 16135 insertions(+), 9618 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index b608406eb3..5792e7e600 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory 
(build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' 
make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-08_19:35:27 +DATE: 2024-02-05_22:17:58 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5331s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6698s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1350s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1291s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1641s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.76E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2141s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3626s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2752s + [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1583s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1695s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 90112 events => throughput is 1.14E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419157e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.119401e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436418e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.150011e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1400s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1365s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.97E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661518E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2210s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3238s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0442s for 90112 events => throughput is 2.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.381265e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.960201e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.442923e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.056109e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1363s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1691s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1662s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2211s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 90112 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2842s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.70E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.446000e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.667896e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.550050e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.846362e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3086s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0306s for 90112 events => throughput is 2.95E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.811128e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.083958e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661462E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1714s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.35E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661462E-002) differ by less than 3E-14 (7.771561172376096e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 90112 events => throughput is 2.44E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.407049e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.476907e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4192s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.99E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5870s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,18 +538,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5013s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.08E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7209s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.82E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.190744e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.167668e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.588028e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968899e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.251687e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.735772e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.876460e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.440392e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.274146e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.710260e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.960127e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.012226e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.226416e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.703491e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.552390e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.128966e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 
fe2c61101c..2b4c81420f 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-08_19:35:47 +DATE: 2024-02-05_22:18:15 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5107s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5046s - [COUNTERS] Fortran MEs ( 1 ) : 0.0061s for 8192 events => throughput is 1.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6630s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6547s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1626s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.02E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2782s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2158s - [COUNTERS] Fortran MEs ( 1 ) : 0.0623s for 90112 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s + [COUNTERS] Fortran MEs ( 1 ) : 0.0891s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382701684199335E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700437610044E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1459s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1407s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1727s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.22E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382701684199335E-002) differ by less than 4E-4 (1.4692721372888684e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700437610044E-002) differ by less than 4E-4 (1.6027646465577305e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515588842633111E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587669165246E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0566s for 90112 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0717s for 90112 events => throughput is 1.26E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515588842633111E-002) differ by less than 4E-4 (1.439903947186849e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587669165246E-002) differ by less than 4E-4 (1.568129937012941e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.663270e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.212872e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.674806e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.231007e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719831741665E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1377s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1356s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1667s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1641s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719831741665E-002) differ by less than 4E-4 (4.740791825774693e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700723828302E-002) differ by less than 4E-4 (1.5721146218172777e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606481761602E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2429s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0277s for 90112 events => throughput is 3.25E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606481761602E-002) differ by less than 4E-4 (4.875410031246474e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002) differ by less than 4E-4 (1.5742791048545257e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.044701e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.211664e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159562e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.224682e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719700521907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1369s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1351s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1660s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1637s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.51E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719700521907E-002) differ by less than 4E-4 (4.6002735842876064e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606480805645E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2430s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0196s for 90112 events => throughput is 4.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3027s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2781s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 90112 events => throughput is 3.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606480805645E-002) differ by less than 4E-4 (4.874365444607065e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.885202e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.652051e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.835303e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1668s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1648s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.02E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2784s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.831859e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.038589e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.986193e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382704356154977E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1670s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1647s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704356154977E-002) differ by less than 4E-4 (1.1831425661412709e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515591292297929E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3074s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 90112 events => throughput is 3.84E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591292297929E-002) differ by less than 4E-4 (1.172226659074127e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.842306e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.784479e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382704338101225E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.86E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.71E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382704338101225E-002) differ by less than 4E-4 (1.1850758729892164e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515591361999701E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.4980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.03E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6939s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591361999701E-002) differ by less than 4E-4 (1.1646102771045719e-07) +OK! 
xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.742092e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.684998e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.171998e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.020092e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.342442e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.026150e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.682729e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.045722e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.347808e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.017663e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.842035e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.235265e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.117791e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.404519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.921682e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.457162e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 66dad24eb4..41ed20187f 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-08_19:36:07 +DATE: 2024-02-05_22:18:32 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5040s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6567s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6485s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1359s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1705s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1622s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2793s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2170s - [COUNTERS] Fortran MEs ( 1 ) : 0.0623s for 90112 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3615s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s + [COUNTERS] Fortran MEs ( 1 ) : 0.0876s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1437s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1378s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1688s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2240s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3634s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0774s for 90112 events => throughput is 1.16E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412457e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.142475e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429501e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.160468e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1380s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1345s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1667s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2205s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0381s for 90112 events => throughput is 2.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3261s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2820s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 90112 events => throughput is 2.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.500136e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.009384e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.529098e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.109517e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1374s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1348s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1660s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.70E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484913930753692e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3099s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.322870e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.614910e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.770588e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1635s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 90112 events => throughput is 2.64E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.670770e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.423505e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.913155e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1746s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.40E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0390s for 90112 events => throughput is 2.31E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.342746e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.449709e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715392009222E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4108s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4103s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.80E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5833s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5828s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715392009222E-002) differ by less than 2E-4 (1.3548862032308762e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,18 +538,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.4993s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.09E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203414e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.201225e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.593725e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.975595e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.287558e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.723786e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.882542e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.507428e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.287734e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.698364e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.955308e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.060413e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.220279e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.736917e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.554716e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.170942e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 
7d092ae287..204bcc3c17 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-08_19:36:26 +DATE: 2024-02-05_22:18:49 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.6734s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6448s - [COUNTERS] Fortran MEs ( 1 ) : 0.0287s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7649s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7239s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 
2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2881s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3503s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3427s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0308s - [COUNTERS] Fortran MEs ( 1 ) : 0.3120s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7562s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3060s + [COUNTERS] Fortran MEs ( 1 ) : 0.4502s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4147s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0627s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3520s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3082s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4002s for 90112 events => throughput is 2.25E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.597150e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.211296e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.603334e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.248534e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756619] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3938s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.71E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756619) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989085] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2488s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0540s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1947s for 90112 events => throughput is 4.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5214s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2881s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2333s for 90112 events => throughput is 3.86E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989085) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989106) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.756491e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.878640e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.783795e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.806741e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3153s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3050s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.24E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1526s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0398s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1128s for 90112 events => throughput is 7.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2853s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1457s for 90112 events => throughput is 6.18E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989114) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.276102e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.965700e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.336215e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.009550e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.13E+05 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4335s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3051s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1284s for 90112 events => throughput is 7.02E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.031412e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.139328e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3667s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.67E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4856s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1921s for 90112 events => throughput is 4.69E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.481319e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.550589e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,8 +505,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -366,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.09E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7810s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7804s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,8 +538,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -399,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3353s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3276s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7200s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7136s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.337786e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.064090e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034768e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.746111e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.781902e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.003251e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.755662e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.076004e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.780759e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.006674e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.956840e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.154292e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.756228e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.001053e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.141575e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.043495e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index f04e38a4a5..5482dc0552 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done 
for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-08_19:36:52 +DATE: 2024-02-05_22:19:16 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5604s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7588s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7181s + [COUNTERS] Fortran MEs ( 1 ) : 0.0407s for 8192 events => throughput is 
2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2910s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3951s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3530s + [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.95E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3449s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0324s - [COUNTERS] Fortran MEs ( 1 ) : 0.3125s for 90112 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7598s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3088s + [COUNTERS] Fortran MEs ( 1 ) : 0.4511s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178241446492] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094177233089695] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3448s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4164s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.40E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094178241446492) differ by less than 4E-4 (1.3934438314322506e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094177233089695) differ by less than 4E-4 (1.6075587627728538e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105686930681671] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105686104543288] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3550s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3018s for 90112 events => throughput is 2.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2170s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8286s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3884s for 90112 events => throughput is 2.32E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105686930681671) differ by less than 4E-4 (1.7724624157278157e-07) +OK! xsec from fortran (47.105695279989099) and cpp (47.105686104543288) differ by less than 4E-4 (1.9478421364738097e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.108953e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.309794e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.106693e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.307910e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176373190514] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173275857273] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3027s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3765s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3624s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.80E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094176373190514) differ by less than 4E-4 (1.7901501314643298e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094173275857273) differ by less than 4E-4 (2.447839242414318e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105685173093654] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105682058834830] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1800s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0392s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1408s for 90112 events => throughput is 6.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4327s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2782s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1545s for 90112 events => throughput is 5.83E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105685173093654) differ by less than 4E-4 (2.1455782361901043e-07) +OK! xsec from fortran (47.105695279989099) and cpp (47.105682058834830) differ by less than 4E-4 (2.8066997403985994e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.477802e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.643949e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.774895e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.734827e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094174474272364] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3055s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3557s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.07E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094174474272364) differ by less than 4E-4 (2.1933672500473733e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684585116684] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1036s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0352s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0685s for 90112 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3622s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2772s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0850s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684585116684) differ by less than 4E-4 (2.2703990176786704e-07) +OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.372076e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.045248e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.049590e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3614s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3544s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2795s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0774s for 90112 events => throughput is 1.16E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.147039e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.377066e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161132e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094178385820562] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.36E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094178385820562) differ by less than 4E-4 (1.3627873807209312e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105688391077187] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3858s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2782s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1076s for 90112 events => throughput is 8.38E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (47.105695279989099) and cpp (47.105688391077187) differ by less than 4E-4 (1.46243715803962e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.950723e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.149487e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176770070867] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5785s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7714s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094176770070867) differ by less than 4E-4 (1.705876382374072e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105687115703695] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3547s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3510s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 90112 events => throughput is 2.42E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6987s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105687115703695) differ by less than 4E-4 (1.733184357144424e-07) +OK! xsec from fortran (47.105695279989099) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722470687011935e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.899433e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.301322e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.182152e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.982927e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.088571e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.833883e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029875e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.787018e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086952e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.833466e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.117274e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.876783e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.307240e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.365328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.512417e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.354802e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 69de55f839..45fc68aa32 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 
'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-08_19:37:18 +DATE: 2024-02-05_22:19:42 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.6035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5750s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7623s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7212s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 
1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3193s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2908s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3500s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3507s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0370s - [COUNTERS] Fortran MEs ( 1 ) : 0.3137s for 90112 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7512s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3026s + [COUNTERS] Fortran MEs ( 1 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3553s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3223s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4230s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428942997143e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4240s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0604s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3636s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7095s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4061s for 90112 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ by less than 2E-4 (2.865932691165085e-08) +OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006634) differ by less than 2E-4 (2.8659327133695456e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.553748e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.202321e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.547176e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.194899e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863908] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3881s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0204s for 8192 events => throughput is 4.02E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863908) differ by less than 2E-4 (2.8413429165041748e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2350s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0459s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1891s for 90112 events => throughput is 4.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5147s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2899s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2248s for 90112 events => throughput is 4.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659326689606246e-08) +OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659327133695456e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.825990e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.870020e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.826297e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.915241e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208834] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3102s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3767s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3634s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186193208834) differ by less than 2E-4 (2.9503689491505725e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630852] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1577s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0471s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1106s for 90112 events => throughput is 8.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4274s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1450s for 90112 events => throughput is 6.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696667630852) differ by less than 2E-4 (2.9458046002517335e-08) +OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.430058e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.250062e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.098829e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.5095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4973s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.76E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4106s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2855s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1252s for 90112 events => throughput is 7.20E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.075524e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499253e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.434399e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0168s for 8192 events => throughput is 4.88E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4742s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2875s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 90112 events => throughput is 4.82E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.593121e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.524976e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184798437837] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5762s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5755s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.12E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7749s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7743s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (47.094184803756626) and cpp (47.094184798437837) differ by less than 2E-4 (1.1293943558143837e-10) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,8 +538,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -399,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0074s for 90112 events => throughput is 1.22E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7352s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7287s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) +OK! 
xsec from fortran (47.105695279989099) and cpp (47.105695279068492) differ by less than 2E-4 (1.9543477947081556e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.387307e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.068039e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.036460e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.702432e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800149e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.012365e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799801e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.064674e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.805211e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.996044e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.006027e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.147754e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.776932e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.022142e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160521e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.000250e+07 ) sec^-1 TEST 
COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 31848fff27..d12de30a22 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:37:44 +DATE: 2024-02-05_22:20:10 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.5805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3781s - [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6745s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s + [COUNTERS] Fortran MEs ( 1 ) : 0.3266s for 8192 events => 
throughput is 2.51E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4585s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2560s - [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6326s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3084s + [COUNTERS] Fortran MEs ( 1 ) : 0.3242s for 8192 events => throughput is 2.53E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4300s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s - [COUNTERS] Fortran MEs ( 1 ) : 2.2307s for 90112 events => throughput is 4.04E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.0816s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5148s + [COUNTERS] Fortran MEs ( 1 ) : 3.5669s for 90112 events => throughput is 2.53E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8274s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5459s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2815s for 8192 events => throughput is 2.91E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9448s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6238s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3210s for 8192 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.5819s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.0979s for 90112 events => throughput is 2.91E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717694E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.3214s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8115s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5099s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717694E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996227e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.636181e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.988570e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640749e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354515] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1351s for 8192 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4712s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1675s for 8192 events => throughput is 4.89E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354515) differ by less than 3E-14 (2.475797344914099e-14) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607748863) differ by less than 3E-14 (2.453592884421596e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8126s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3366s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4760s for 90112 events => throughput is 6.11E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.5511s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7126s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8385s for 90112 events => throughput is 4.90E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717680E-002) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.158217e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.072602e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.162344e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.077354e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3964s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3297s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0667s for 8192 events => throughput is 1.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0833s for 8192 events => throughput is 9.83E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.0050s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2715s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7335s for 90112 events => throughput is 1.23E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5092s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5875s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9217s for 90112 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.271134e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.006722e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.926199e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3817s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.3873s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8085s for 90112 events => throughput is 1.11E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125594e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.276682e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.143588e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5051s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0987s for 8192 events => throughput is 8.30E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7499s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6373s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1127s for 90112 events => throughput is 8.10E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.367410e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.383766e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354760] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5698s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5623s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7445s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.10112317668354764) and cpp (0.10112317668354760) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5839s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 90112 events => throughput is 1.09E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9506s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9280s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136030e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632699e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.166900e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.198067e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.675509e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.687085e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.305503e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.245374e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.674372e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.652855e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.839205e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.254650e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.662393e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.685444e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.405471e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.763638e+06 ) sec^-1 TEST 
COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 60199f2caa..6acca50600 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:38:22 +DATE: 2024-02-05_22:20:53 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4836s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s - [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6668s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3423s + [COUNTERS] Fortran MEs ( 1 ) : 0.3245s for 8192 events => 
throughput is 2.52E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4597s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2574s - [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s + [COUNTERS] Fortran MEs ( 1 ) : 0.3277s for 8192 events => throughput is 2.50E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4250s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1992s - [COUNTERS] Fortran MEs ( 1 ) : 2.2257s for 90112 events => throughput is 4.05E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.1044s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5176s + [COUNTERS] Fortran MEs ( 1 ) : 3.5868s for 90112 events => throughput is 2.51E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291597608296] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112722327776243] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7637s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2556s for 8192 events => throughput is 3.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9147s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6082s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3065s for 8192 events => throughput is 2.67E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291597608296) differ by less than 4E-4 (2.5781178285555484e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722327776243) differ by less than 4E-4 (2.5986973362090993e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239221732791437E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.2737s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4606s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8131s for 90112 events => throughput is 3.20E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238466406484034E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.1640s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8215s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3425s for 90112 events => throughput is 2.70E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239221732791437E-002) differ by less than 4E-4 (1.8599953477416165e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238466406484034E-002) differ by less than 4E-4 (1.9594309874637617e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.314907e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.825867e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.317579e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.822029e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112290421591680] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112720218188545] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4097s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4892s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0910s for 8192 events => throughput is 9.00E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112290421591680) differ by less than 4E-4 (2.6944132867079418e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720218188545) differ by less than 4E-4 (2.8073040938547678e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239212368085274E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.1080s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8264s for 90112 events => throughput is 1.09E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238450523404405E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5887s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5903s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9985s for 90112 events => throughput is 9.02E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239212368085274E-002) differ by less than 4E-4 (3.0418222529693395e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238450523404405E-002) differ by less than 4E-4 (3.9638963988952725e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104565e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.157934e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.091624e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.113656e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291415112837] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2932s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291415112837) differ by less than 4E-4 (2.5961646764605106e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239211617250407E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.6161s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2416s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3745s for 90112 events => throughput is 2.41E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0166s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5446s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4720s for 90112 events => throughput is 1.91E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239211617250407E-002) differ by less than 4E-4 (3.136577692020026e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.461538e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.923701e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922908e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.3835s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3461s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 8192 events => throughput is 2.19E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9524s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5375s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4149s for 90112 events => throughput is 2.17E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.197542e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.467933e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.185435e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112723411062496] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4079s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3585s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723411062496) differ by less than 4E-4 (2.491576483576452e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238464401552092E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0986s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5435s for 90112 events => throughput is 1.66E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464401552092E-002) differ by less than 4E-4 (2.2124560195013743e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.635897e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.640737e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112292787307366] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112726034625695] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5719s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7361s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.77E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.10112317668354764) and cpp (0.10112292787307366) differ by less than 4E-4 (2.4604693221741414e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625695) differ by less than 4E-4 (2.2321452151086163e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239222545537072E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5254s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5013s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0241s for 90112 events => throughput is 3.74E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9356s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9261s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.50E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239222545537072E-002) differ by less than 4E-4 (1.7574267630049434e-07) +OK! 
xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.797913e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.324054e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.631443e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.861547e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.467315e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.639674e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086873e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.427401e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.473805e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.660976e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.637719e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.534637e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.424130e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.507023e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.261612e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.629083e+07 
) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index d544006f8b..efef9126ba 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:38:58 +DATE: 2024-02-05_22:21:32 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4837s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s - [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6659s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3410s + [COUNTERS] Fortran MEs ( 1 ) : 0.3248s for 8192 events => 
throughput is 2.52E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4602s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2576s - [COUNTERS] Fortran MEs ( 1 ) : 0.2026s for 8192 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s + [COUNTERS] Fortran MEs ( 1 ) : 0.3208s for 8192 events => throughput is 2.55E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4295s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2025s - [COUNTERS] Fortran MEs ( 1 ) : 2.2270s for 90112 events => throughput is 4.05E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.1015s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5259s + [COUNTERS] Fortran MEs ( 1 ) : 3.5756s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317761225882] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8252s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5389s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2863s for 8192 events => throughput is 2.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317761225882) differ by less than 2E-4 (9.183959592817814e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700702684) differ by less than 2E-4 (9.191721828116783e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237217958461E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.6463s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4911s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1552s for 90112 events => throughput is 2.86E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.4987s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8629s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6359s for 90112 events => throughput is 2.48E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237217958461E-002) differ by less than 2E-4 (9.4234364755863e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002) differ by less than 2E-4 (9.423232416594374e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.920474e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.570856e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.924422e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.544366e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317763556192] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748702805031] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3897s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 8192 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4688s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1649s for 8192 events => throughput is 4.97E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317763556192) differ by less than 2E-4 (9.41440236879032e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805031) differ by less than 2E-4 (9.399612643790078e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237221421968E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8268s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3392s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4876s for 90112 events => throughput is 6.06E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482683055653E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.4637s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6640s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7997s for 90112 events => throughput is 5.01E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237221421968E-002) differ by less than 2E-4 (9.467145956065792e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055653E-002) differ by less than 2E-4 (9.469362849401364e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.219081e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.095748e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.226616e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.111417e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317741957558] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3258s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 8192 events => throughput is 1.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4771s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0837s for 8192 events => throughput is 9.79E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317741957558) differ by less than 2E-4 (7.278528668663853e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237072275287E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.9923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7219s for 90112 events => throughput is 1.25E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.4900s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9135s for 90112 events => throughput is 9.86E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237072275287E-002) differ by less than 2E-4 (7.584913142011374e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.280720e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.013857e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.015949e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.3555s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5646s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7909s for 90112 events => throughput is 1.14E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152174e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.274185e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.139304e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5114s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1017s for 8192 events => throughput is 8.06E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7719s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6293s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1426s for 90112 events => throughput is 7.89E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.151380e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.222415e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317662375726] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5841s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5766s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7428s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.10112317668354764) and cpp (0.10112317662375726) differ by less than 2E-4 (5.9126292750733e-10) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236476482192E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5790s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4964s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0826s for 90112 events => throughput is 1.09E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9444s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236476482192E-002) differ by less than 2E-4 (6.599809587726213e-11) +OK! 
xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.139394e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.637764e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153627e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.311283e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.672492e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.607274e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.306451e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.233492e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.681234e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.634483e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.840455e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243748e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.665696e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.619987e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.397776e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.709088e+06 
) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index bc6d179012..42f6d38589 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:39:36 +DATE: 2024-02-05_22:22:15 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.8828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3925s - [COUNTERS] Fortran MEs ( 1 ) : 2.4903s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4979s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s + [COUNTERS] Fortran MEs ( 1 ) : 4.1556s for 
8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2733s - [COUNTERS] Fortran MEs ( 1 ) : 2.5077s for 8192 events => throughput is 3.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4698s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3331s + [COUNTERS] Fortran MEs ( 1 ) : 4.1367s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 28.9805s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5848s - [COUNTERS] Fortran MEs ( 1 ) : 27.3957s for 90112 events => throughput is 3.29E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 47.6993s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0438s + [COUNTERS] Fortran MEs ( 1 ) : 45.6555s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.7782s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0129s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7654s for 8192 events => throughput is 2.18E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.9166s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5742s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3424s for 8192 events => throughput is 1.89E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 46.7387s - [COUNTERS] Fortran Overhead ( 0 ) : 5.2958s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.4429s for 90112 events => throughput is 2.17E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 54.1949s + [COUNTERS] Fortran Overhead ( 0 ) : 6.2939s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.9009s for 90112 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.239534e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.927146e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.231355e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.934290e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.5669s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9185s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6484s for 8192 events => throughput is 4.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7446s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5173s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2273s for 8192 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.3966s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2630s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.1336s for 90112 events => throughput is 4.97E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 28.9914s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2638s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.7275s for 90112 events => throughput is 3.64E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.089371e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.808325e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.071916e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.810712e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579728E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6587s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9637s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6951s for 8192 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2692s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2960s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9732s for 8192 events => throughput is 8.42E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579728E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.8396s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2455s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.5942s for 90112 events => throughput is 1.19E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 13.7356s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0257s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7099s for 90112 events => throughput is 8.41E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.228414e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.639623e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.677492e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.0280s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1744s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8536s for 8192 events => throughput is 9.60E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 12.2880s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8878s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4002s for 90112 events => throughput is 9.59E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.851640e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.227429e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.826048e+03 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.4467s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3864s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0603s for 8192 events => throughput is 7.73E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 14.9115s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1698s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7417s for 90112 events => throughput is 7.67E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.829236e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.877772e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579723E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9692s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8564s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1127s for 8192 events => throughput is 7.27E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8732s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8402s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579723E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914653E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3032s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0640s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2391s for 90112 events => throughput is 7.27E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.9273s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5680s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3593s for 90112 events => throughput is 2.51E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914653E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.344224e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.297744e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.505970e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.523615e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244620e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.102829e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.041653e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.140866e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247410e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114484e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233850e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.173422e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.239643e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.119855e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391120e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) 
= ( 1.426650e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 7b61eaf73e..89070eac21 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:43:24 +DATE: 2024-02-05_22:26:31 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2754s - [COUNTERS] Fortran MEs ( 1 ) : 2.4993s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4620s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] Fortran MEs ( 1 ) : 4.1254s for 
8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7635s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2744s - [COUNTERS] Fortran MEs ( 1 ) : 2.4891s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3337s + [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.1495s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5827s - [COUNTERS] Fortran MEs ( 1 ) : 27.5667s for 90112 events => throughput is 3.27E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 47.7647s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0487s + [COUNTERS] Fortran MEs ( 1 ) : 45.7160s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704259755238570E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703728935895570E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.9384s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5876s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3508s for 8192 events => throughput is 2.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.1948s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2129s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9819s for 8192 events => throughput is 2.06E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704259755238570E-004) differ by less than 4E-4 (3.0134411834747965e-06) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728935895570E-004) differ by less than 4E-4 (3.0081376303225937e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580182117605E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 41.7441s - [COUNTERS] Fortran Overhead ( 0 ) : 4.8878s - [COUNTERS] CudaCpp MEs ( 2 ) : 36.8562s for 90112 events => throughput is 2.44E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793486223749466E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 49.9434s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9464s + [COUNTERS] CudaCpp MEs ( 2 ) : 43.9970s for 90112 events => throughput is 2.05E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580182117605E-004) differ by less than 4E-4 (3.024668687290344e-06) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486223749466E-004) differ by less than 4E-4 (3.0127256538392544e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.501242e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.118265e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.505866e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.118721e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254541054809E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703721162664038E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.9393s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1027s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8366s for 8192 events => throughput is 9.79E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5421s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4276s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1145s for 8192 events => throughput is 7.35E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254541054809E-004) differ by less than 4E-4 (2.8787221757475834e-06) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703721162664038E-004) differ by less than 4E-4 (2.8072976823168005e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578161882866E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 11.6401s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3907s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.2493s for 90112 events => throughput is 9.74E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793482900053113E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 15.5186s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1615s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.3571s for 90112 events => throughput is 7.29E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578161882866E-004) differ by less than 4E-4 (2.896753368286653e-06) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482900053113E-004) differ by less than 4E-4 (2.8022777314173908e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.007784e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.661702e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002571e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.676226e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254166302247E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9898s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6382s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3516s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3053s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8175s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4878s for 8192 events => throughput is 1.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254166302247E-004) differ by less than 4E-4 (2.8690396836061893e-06) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578009696313E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 5.8017s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9237s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8780s for 90112 events => throughput is 2.32E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 7.9508s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5496s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4012s for 90112 events => throughput is 1.67E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578009696313E-004) differ by less than 4E-4 (2.887117363403746e-06) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.410213e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.722931e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.655694e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.1890s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7577s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4314s for 8192 events => throughput is 1.90E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 7.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4657s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7139s for 90112 events => throughput is 1.91E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.965165e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.405195e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.961063e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703728656142196E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.3838s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5269s for 8192 events => throughput is 1.55E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728656142196E-004) differ by less than 4E-4 (3.0009095357552695e-06) -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793486988396928E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 8.3413s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5676s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7737s for 90112 events => throughput is 1.56E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486988396928E-004) differ by less than 4E-4 (3.0611411687697654e-06) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.563090e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.584923e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704261630635685E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.7970s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7415s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0555s for 8192 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8348s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.84E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704261630635685E-004) differ by less than 4E-4 (3.0618958697381515e-06) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.197566737389579e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580869662166E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 2.5912s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9808s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6105s for 90112 events => throughput is 1.48E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.7662s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5323s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580869662166E-004) differ by less than 4E-4 (3.0682019858119247e-06) +OK! 
xsec from fortran (1.5793438642451704E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.2090047175081793e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.480897e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.598573e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.983179e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.958409e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.715216e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.495046e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.319773e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.667956e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.709955e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.492157e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.074118e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.729344e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.705288e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.485281e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.430844e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) 
= ( 2.532672e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 294c855cdc..a9f0e03001 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:46:30 +DATE: 2024-02-05_22:29:50 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2791s - [COUNTERS] Fortran MEs ( 1 ) : 2.4919s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s + [COUNTERS] Fortran MEs ( 1 ) : 4.1360s for 
8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7717s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2746s - [COUNTERS] Fortran MEs ( 1 ) : 2.4971s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4821s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3396s + [COUNTERS] Fortran MEs ( 1 ) : 4.1425s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0020s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5876s - [COUNTERS] Fortran MEs ( 1 ) : 27.4144s for 90112 events => throughput is 3.29E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 48.1896s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0584s + [COUNTERS] Fortran MEs ( 1 ) : 46.1312s for 90112 events => throughput is 1.95E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143272044121E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612659176647E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.8731s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0673s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8059s for 8192 events => throughput is 2.15E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.0456s + [COUNTERS] Fortran Overhead ( 0 ) : 4.6365s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4091s for 8192 events => throughput is 1.86E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143272044121E-004) differ by less than 2E-4 (3.861716058040088e-09) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612659176647E-004) differ by less than 2E-4 (3.851689633904698e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532474032691E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 47.1946s - [COUNTERS] Fortran Overhead ( 0 ) : 5.3591s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.8355s for 90112 events => throughput is 2.15E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438704534937E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 54.9541s + [COUNTERS] Fortran Overhead ( 0 ) : 6.4100s + [COUNTERS] CudaCpp MEs ( 2 ) : 48.5441s for 90112 events => throughput is 1.86E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532474032691E-004) differ by less than 2E-4 (3.933131154099101e-09) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438704534937E-004) differ by less than 2E-4 (3.930950898123342e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178564e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.919374e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.190429e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.902053e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143304774347E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612692816692E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.4953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8665s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6288s for 8192 events => throughput is 5.03E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7464s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5234s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2230s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143304774347E-004) differ by less than 2E-4 (4.707367828871156e-09) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612692816692E-004) differ by less than 2E-4 (4.720860369289426e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532476698221E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.1380s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2192s - [COUNTERS] CudaCpp MEs ( 2 ) : 17.9188s for 90112 events => throughput is 5.03E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438707226032E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 28.6832s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2381s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4451s for 90112 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532476698221E-004) differ by less than 2E-4 (4.101904815811963e-09) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438707226032E-004) differ by less than 2E-4 (4.101344153184527e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.172254e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.801229e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.155646e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.806913e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143287857844E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9367s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6803s for 8192 events => throughput is 1.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2352s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2775s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9577s for 8192 events => throughput is 8.55E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143287857844E-004) differ by less than 2E-4 (4.2702956726259345e-09) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532473043530E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.7082s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2389s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.4693s for 90112 events => throughput is 1.21E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 13.6830s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0214s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.6615s for 90112 events => throughput is 8.45E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532473043530E-004) differ by less than 2E-4 (3.870500364655527e-09) +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.222921e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.694548e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.713044e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.0177s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1622s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8555s for 8192 events => throughput is 9.58E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 12.2482s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8854s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.3628s for 90112 events => throughput is 9.62E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.966213e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.236253e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.798303e+03 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.4852s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4051s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0801s for 8192 events => throughput is 7.58E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 14.9006s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1273s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7733s for 90112 events => throughput is 7.65E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.689329e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.748625e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143124638075E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8791s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7656s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1135s for 8192 events => throughput is 7.22E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8417s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143124638075E-004) differ by less than 2E-4 (5.318190332559425e-11) +OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.4279691852343603e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411887058E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3145s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0695s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2450s for 90112 events => throughput is 7.24E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642387715E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.9385s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3630s for 90112 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411887058E-004) differ by less than 2E-4 (1.7474910407599964e-12) +OK! 
xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642387715E-004) differ by less than 2E-4 (4.051647906067046e-12) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.287101e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.285322e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.492219e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.527417e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245674e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.111025e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.022906e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.154637e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.246083e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.103369e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231666e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.159233e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243352e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.097040e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.379631e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) 
= ( 1.433644e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 885118dbe4..91a0e957d1 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-08_19:28:28 +DATE: 2024-02-05_22:35:39 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.6661s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5009s - [COUNTERS] Fortran MEs ( 1 ) : 54.1652s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 95.1303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4967s + [COUNTERS] Fortran MEs ( 1 ) : 94.6335s for 8192 events => throughput is 8.66E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.6933s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4405s - [COUNTERS] Fortran MEs ( 1 ) : 54.2528s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 94.7777s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4926s + [COUNTERS] Fortran MEs ( 1 ) : 94.2850s for 8192 events => throughput is 8.69E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 602.4668s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1984s - [COUNTERS] Fortran MEs ( 1 ) : 599.2684s for 90112 events => throughput is 1.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1046.4939s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3422s + [COUNTERS] Fortran MEs ( 1 ) : 1042.1517s for 90112 events => throughput is 8.65E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 176.1915s - [COUNTERS] Fortran Overhead ( 0 ) : 80.7281s - [COUNTERS] CudaCpp MEs ( 2 ) : 95.4634s for 8192 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 210.0765s + [COUNTERS] Fortran Overhead ( 0 ) : 97.1032s + [COUNTERS] CudaCpp MEs ( 2 ) : 112.9733s for 8192 events => throughput is 7.25E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085453E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1129.7650s - [COUNTERS] Fortran Overhead ( 0 ) : 82.4810s - [COUNTERS] CudaCpp MEs ( 2 ) : 1047.2841s for 90112 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1343.2987s + [COUNTERS] Fortran Overhead ( 0 ) : 100.2236s + [COUNTERS] CudaCpp MEs ( 2 ) : 1243.0751s for 90112 events => throughput is 7.25E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085453E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.033504e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.092994e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.037398e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.445665e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 81.1983s - [COUNTERS] Fortran Overhead ( 0 ) : 36.6778s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.5205s for 8192 events => throughput is 1.84E+02 events/s + [COUNTERS] PROGRAM TOTAL : 106.7093s + [COUNTERS] Fortran Overhead ( 0 ) : 49.4816s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.2277s for 8192 events => throughput is 1.43E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085448E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 529.9074s - [COUNTERS] Fortran Overhead ( 0 ) : 39.3238s - [COUNTERS] CudaCpp MEs ( 2 ) : 490.5836s for 90112 events => throughput is 1.84E+02 events/s + [COUNTERS] PROGRAM TOTAL : 686.0903s + [COUNTERS] Fortran Overhead ( 0 ) : 53.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 632.7935s for 90112 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085448E-007) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.7763568394002505e-15) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262675e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.657733e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.273542e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.659779e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 35.1935s - [COUNTERS] Fortran Overhead ( 0 ) : 16.0517s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.1418s for 8192 events => throughput is 4.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.2187s + [COUNTERS] Fortran Overhead ( 0 ) : 23.1972s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.0215s for 8192 events => throughput is 3.03E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085445E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 228.7491s - [COUNTERS] Fortran Overhead ( 0 ) : 18.6295s - [COUNTERS] CudaCpp MEs ( 2 ) : 210.1196s for 90112 events => throughput is 4.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 322.8676s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8768s + [COUNTERS] CudaCpp MEs ( 2 ) : 295.9908s for 90112 events => throughput is 3.04E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085445E-007) differ by less than 3E-14 (1.1102230246251565e-15) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.274856e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.617077e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.626995e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 44.2033s + [COUNTERS] Fortran Overhead ( 0 ) : 20.0984s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.1049s for 8192 events => throughput is 3.40E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 287.4746s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9380s + [COUNTERS] CudaCpp MEs ( 2 ) : 263.5366s for 90112 events => throughput is 3.42E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.163294e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.276171e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.178912e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 45.5276s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2470s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2806s for 8192 events => throughput is 3.52E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 280.9380s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0371s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.9009s for 90112 events => throughput is 3.54E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.775442e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.798228e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 11.5004s - [COUNTERS] Fortran Overhead ( 0 ) : 7.6961s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8043s for 8192 events => throughput is 2.15E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2469s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1621s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0848s for 8192 events => throughput is 7.55E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085437E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 51.8753s - [COUNTERS] Fortran Overhead ( 0 ) : 10.1190s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.7563s for 90112 events => throughput is 2.16E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.9307s + [COUNTERS] Fortran Overhead ( 0 ) : 7.0189s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9118s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085437E-007) differ by less than 3E-14 (8.881784197001252e-16) +OK! 
xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.175030e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.516506e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229111e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.276640e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552785e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.254877e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.462719e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.570497e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.564096e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.248946e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.521452e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.448571e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.559456e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.255913e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123633e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 3.238725e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 70f4cabe46..6e7885c855 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-08_20:23:49 +DATE: 2024-02-06_00:00:45 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.4328s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s - [COUNTERS] Fortran MEs ( 1 ) : 54.0651s for 8192 events => throughput is 1.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 95.1870s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4906s + [COUNTERS] Fortran MEs ( 1 ) : 94.6964s for 8192 events => throughput is 8.65E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.4664s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4348s - [COUNTERS] Fortran MEs ( 1 ) : 54.0315s for 8192 events => throughput is 1.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 94.9685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4953s + [COUNTERS] Fortran MEs ( 1 ) : 94.4732s for 8192 events => throughput is 8.67E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 598.2310s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0769s - [COUNTERS] Fortran MEs ( 1 ) : 595.1541s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1045.4531s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3853s + [COUNTERS] Fortran MEs ( 1 ) : 1041.0679s for 90112 events => throughput is 8.66E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363572559468E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719498009764E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 161.0181s - [COUNTERS] Fortran Overhead ( 0 ) : 73.7447s - [COUNTERS] CudaCpp MEs ( 2 ) : 87.2734s for 8192 events => throughput is 9.39E+01 events/s + [COUNTERS] PROGRAM TOTAL : 191.0542s + [COUNTERS] Fortran Overhead ( 0 ) : 88.4952s + [COUNTERS] CudaCpp MEs ( 2 ) : 102.5590s for 8192 events => throughput is 7.99E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363572559468E-006) differ by less than 4E-4 (0.00013984863241267576) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719498009764E-006) differ by less than 4E-4 (0.00013981555433351112) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326080615569212E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326289850060011E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1039.3540s - [COUNTERS] Fortran Overhead ( 0 ) : 76.5015s - [COUNTERS] CudaCpp MEs ( 2 ) : 962.8525s for 90112 events => throughput is 9.36E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1218.1212s + [COUNTERS] Fortran Overhead ( 0 ) : 92.3855s + [COUNTERS] CudaCpp MEs ( 2 ) : 1125.7357s for 90112 events => throughput is 8.00E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326080615569212E-007) differ by less than 4E-4 (0.00014136252059526733) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326289850060011E-007) differ by less than 4E-4 (0.00014135250101854346) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.118262e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.234084e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120899e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.340763e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405361288903015E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716133562926E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 39.9885s - [COUNTERS] Fortran Overhead ( 0 ) : 18.0960s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.8925s for 8192 events => throughput is 3.74E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.0072s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3751s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.6321s for 8192 events => throughput is 3.20E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405361288903015E-006) differ by less than 4E-4 (0.0001396645204514435) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405716133562926E-006) differ by less than 4E-4 (0.0001395443151488429) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326076878598447E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326283773234128E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 253.5381s - [COUNTERS] Fortran Overhead ( 0 ) : 20.6367s - [COUNTERS] CudaCpp MEs ( 2 ) : 232.9014s for 90112 events => throughput is 3.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 310.5028s + [COUNTERS] Fortran Overhead ( 0 ) : 26.9907s + [COUNTERS] CudaCpp MEs ( 2 ) : 283.5121s for 90112 events => throughput is 3.18E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326076878598447E-007) differ by less than 4E-4 (0.00014120229226155523) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283773234128E-007) differ by less than 4E-4 (0.00014109195015965525) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.660807e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.609202e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.664498e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.624327e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405360895331841E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 17.8143s - [COUNTERS] Fortran Overhead ( 0 ) : 8.1811s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6332s for 8192 events => throughput is 8.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.1152s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8289s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.2863s for 8192 events => throughput is 6.17E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405360895331841E-006) differ by less than 4E-4 (0.00013963279012663143) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326069099562333E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 115.7443s - [COUNTERS] Fortran Overhead ( 0 ) : 10.8101s - [COUNTERS] CudaCpp MEs ( 2 ) : 104.9342s for 90112 events => throughput is 8.59E+02 events/s + [COUNTERS] PROGRAM TOTAL : 162.6923s + [COUNTERS] Fortran Overhead ( 0 ) : 15.8974s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.7949s for 90112 events => throughput is 6.14E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326069099562333E-007) differ by less than 4E-4 (0.00014086875419705436) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054826e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.205911e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.215076e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 22.6386s + [COUNTERS] Fortran Overhead ( 0 ) : 10.6852s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9534s for 8192 events => throughput is 6.85E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 145.3504s + [COUNTERS] Fortran Overhead ( 0 ) : 14.3703s + [COUNTERS] CudaCpp MEs ( 2 ) : 130.9800s for 90112 events => throughput is 6.88E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.145261e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.061844e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.089522e+02 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405719423038986E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 22.6291s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3109s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.3183s for 8192 events => throughput is 7.24E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719423038986E-006) differ by less than 4E-4 (0.00013980951024539223) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326283662420285E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 140.4388s + [COUNTERS] Fortran Overhead ( 0 ) : 15.1102s + [COUNTERS] CudaCpp MEs ( 2 ) : 125.3286s for 90112 events => throughput is 7.19E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283662420285E-007) differ by less than 4E-4 (0.00014108719888938914) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.551293e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.502865e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363557292459E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509506E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 6.0668s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2571s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8097s for 8192 events => throughput is 4.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5073s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0137s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4936s for 8192 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363557292459E-006) differ by less than 4E-4 (0.00013984740156258724) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405722175509506E-006) differ by less than 4E-4 (0.00014003141235763295) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326074784076956E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 26.6359s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8241s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8118s for 90112 events => throughput is 4.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.2651s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8307s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4344s for 90112 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326074784076956E-007) differ by less than 4E-4 (0.00014111248645076735) +OK! 
xsec from fortran (2.3322993086655972E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.00014165768834106807) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.576856e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.641774e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.546931e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628707e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.397159e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.317424e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.533190e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.380209e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.399878e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.351711e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.130059e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.398349e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.400049e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.351253e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.094833e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 6.434374e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index dd4d5d35ae..5311267b6e 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-08_21:07:57 +DATE: 2024-02-06_01:04:55 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.4597s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3673s - [COUNTERS] Fortran MEs ( 1 ) : 54.0925s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 95.2652s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4947s + [COUNTERS] Fortran MEs ( 1 ) : 94.7705s for 8192 events => throughput is 8.64E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5397s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4010s - [COUNTERS] Fortran MEs ( 1 ) : 54.1387s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 94.9824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4952s + [COUNTERS] Fortran MEs ( 1 ) : 94.4872s for 8192 events => throughput is 8.67E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 598.2283s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0901s - [COUNTERS] Fortran MEs ( 1 ) : 595.1382s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1049.9382s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3738s + [COUNTERS] Fortran MEs ( 1 ) : 1045.5645s for 90112 events => throughput is 8.62E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629013416990E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985299359846E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 174.1417s - [COUNTERS] Fortran Overhead ( 0 ) : 80.0402s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.1014s for 8192 events => throughput is 8.71E+01 events/s + [COUNTERS] PROGRAM TOTAL : 210.2766s + [COUNTERS] Fortran Overhead ( 0 ) : 96.9104s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.3662s for 8192 events => throughput is 7.23E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629013416990E-006) differ by less than 2E-4 (5.7565425759520394e-09) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985299359846E-006) differ by less than 2E-4 (5.7578810608305275e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783773791503E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1119.8209s - [COUNTERS] Fortran Overhead ( 0 ) : 82.7419s - [COUNTERS] CudaCpp MEs ( 2 ) : 1037.0790s for 90112 events => throughput is 8.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1355.4366s + [COUNTERS] Fortran Overhead ( 0 ) : 101.1078s + [COUNTERS] CudaCpp MEs ( 2 ) : 1254.3289s for 90112 events => throughput is 7.18E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783773791503E-007) differ by less than 2E-4 (5.389840573855054e-09) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389403812117166e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026356e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.512949e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.036053e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.521314e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629009850969E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985295828473E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 78.5590s - [COUNTERS] Fortran Overhead ( 0 ) : 35.0840s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.4750s for 8192 events => throughput is 1.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 109.5011s + [COUNTERS] Fortran Overhead ( 0 ) : 50.7884s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.7127s for 8192 events => throughput is 1.40E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629009850969E-006) differ by less than 2E-4 (5.469044328521022e-09) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985295828473E-006) differ by less than 2E-4 (5.473184350179849e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783784120318E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222645648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 517.3622s - [COUNTERS] Fortran Overhead ( 0 ) : 37.8419s - [COUNTERS] CudaCpp MEs ( 2 ) : 479.5204s for 90112 events => throughput is 1.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 699.7664s + [COUNTERS] Fortran Overhead ( 0 ) : 54.5245s + [COUNTERS] CudaCpp MEs ( 2 ) : 645.2418s for 90112 events => throughput is 1.40E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783784120318E-007) differ by less than 2E-4 (5.832704319530535e-09) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222645648E-007) differ by less than 2E-4 (5.8307128014689624e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.373550e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.643172e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.359491e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.637966e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629007633195E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 33.9565s - [COUNTERS] Fortran Overhead ( 0 ) : 15.2150s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.7415s for 8192 events => throughput is 4.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.5676s + [COUNTERS] Fortran Overhead ( 0 ) : 22.0164s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5512s for 8192 events => throughput is 3.21E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629007633195E-006) differ by less than 2E-4 (5.290244020628165e-09) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783783946155E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 224.7320s - [COUNTERS] Fortran Overhead ( 0 ) : 17.7872s - [COUNTERS] CudaCpp MEs ( 2 ) : 206.9448s for 90112 events => throughput is 4.35E+02 events/s + [COUNTERS] PROGRAM TOTAL : 305.3712s + [COUNTERS] Fortran Overhead ( 0 ) : 25.6316s + [COUNTERS] CudaCpp MEs ( 2 ) : 279.7396s for 90112 events => throughput is 3.22E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783783946155E-007) differ by less than 2E-4 (5.825236737422301e-09) +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.612074e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.853680e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.839043e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 41.7281s + [COUNTERS] Fortran Overhead ( 0 ) : 19.1520s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.5761s for 8192 events => throughput is 3.63E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 271.0485s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0363s + [COUNTERS] CudaCpp MEs ( 2 ) : 248.0121s for 90112 events => throughput is 3.63E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.395944e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.589380e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.405946e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 44.0281s + [COUNTERS] Fortran Overhead ( 0 ) : 21.6063s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.4218s for 8192 events => throughput is 3.65E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 270.4138s + [COUNTERS] Fortran Overhead ( 0 ) : 25.3779s + [COUNTERS] CudaCpp MEs ( 2 ) : 245.0359s for 90112 events => throughput is 3.68E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.938341e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.926685e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628931370709E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 12.2525s - [COUNTERS] Fortran Overhead ( 0 ) : 8.0735s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1789s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5865s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7235s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628931370709E-006) differ by less than 2E-4 (8.581571009358413e-10) +OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480693924894922e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783640044522E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576736E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 56.4709s - [COUNTERS] Fortran Overhead ( 0 ) : 10.5806s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.8902s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.0732s + [COUNTERS] Fortran Overhead ( 0 ) : 6.5735s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4997s for 90112 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783640044522E-007) differ by less than 2E-4 (3.447657714872321e-10) +OK! 
xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993078576736E-007) differ by less than 2E-4 (3.4640645907302314e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.979939e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.407818e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.998270e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.084506e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.330145e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.114502e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387574e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.160968e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328512e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.107573e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.245538e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.113991e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.317098e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111189e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.085726e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 3.632319e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 1e31973081..3e31dbe95a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-08_19:50:18 +DATE: 2024-02-05_22:34:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4049s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4576s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s + [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => 
throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3084s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2603s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3144s + [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7392s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2138s - [COUNTERS] Fortran MEs ( 1 ) : 0.5253s for 90112 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2820s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5165s + [COUNTERS] Fortran MEs ( 1 ) : 0.7655s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4186s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4725s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3960s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263335) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343820] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561287] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0585s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2942s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7643s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4597s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6104s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8493s for 90112 events => throughput is 1.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343820) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561287) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.208650e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071916e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.207106e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066839e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166122] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351262536] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3995s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166122) differ by less than 3E-14 (2.9531932455029164e-14) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262536) differ by less than 3E-14 (2.930988785010413e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6148s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3654s for 90112 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0138s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5775s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4364s for 90112 events => throughput is 2.07E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561290) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.506508e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.048379e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.463460e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.053491e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2986s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3649s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4401s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1908s for 90112 events => throughput is 4.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8252s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5671s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2581s for 90112 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.791511e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.435366e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.864406e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.453652e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0205s for 8192 events => throughput is 3.99E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5597s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2249s for 90112 events => throughput is 4.01E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.968141e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.963827e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3473s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0299s for 8192 events => throughput is 2.74E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8834s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3214s for 90112 events => throughput is 2.80E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.713679e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.652984e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! 
' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263352] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7431s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7425s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263352) differ by less than 3E-14 (8.881784197001252e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561298] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9618s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9543s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.21E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561298) differ by less than 3E-14 (4.440892098500626e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.577372e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168278e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.398056e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.505593e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.390610e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.767356e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.391773e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.780303e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 3529f52591..9909d81694 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-08_19:50:40 +DATE: 2024-02-05_22:34:38 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3537s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3056s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4477s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3778s + [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => 
throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3119s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2638s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3823s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s + [COUNTERS] Fortran MEs ( 1 ) : 0.0698s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7486s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2222s - [COUNTERS] Fortran MEs ( 1 ) : 0.5263s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2711s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5092s + [COUNTERS] Fortran MEs ( 1 ) : 0.7619s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110149549279866] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110461852325612] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3774s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0563s for 8192 events => throughput is 1.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4797s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110149549279866) differ by less than 4E-4 (2.840326210895583e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110461852325612) differ by less than 4E-4 (2.8586276618058903e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510678843355344] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510685241079500] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8952s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2754s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 90112 events => throughput is 1.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3864s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6235s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7628s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510678843355344) differ by less than 4E-4 (4.2350520312872675e-08) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685241079500) differ by less than 4E-4 (6.11548025553077e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.480352e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.215484e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.487866e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.198326e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110146988852984] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110456793177945] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3638s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110146988852984) differ by less than 4E-4 (2.934771267448788e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110456793177945) differ by less than 4E-4 (3.0452395031188573e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676993136629] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510681375304044] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4630s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2211s for 90112 events => throughput is 4.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8134s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5557s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2577s for 90112 events => throughput is 3.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676993136629) differ by less than 4E-4 (1.2836447871311663e-07) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510681375304044) differ by less than 4E-4 (2.408689854238588e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.182134e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.485750e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.303630e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.433033e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110148793566186] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110458350871136] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2731s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3444s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3319s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110148793566186) differ by less than 4E-4 (2.8682018052839098e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110458350871136) differ by less than 4E-4 (2.987782395047489e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676419088856] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.3319s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2267s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1052s for 90112 events => throughput is 8.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6853s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5504s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1349s for 90112 events => throughput is 6.68E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676419088856) differ by less than 4E-4 (1.5505111905511626e-07) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.765258e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.565170e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.816790e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.672553e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110458350871136] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3399s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3287s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.35E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110458350871136) differ by less than 4E-4 (2.987782395047489e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7807s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6487s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1319s for 90112 events => throughput is 6.83E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.899301e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.086975e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3525s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3371s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.29E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510685411522340] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7160s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5505s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 90112 events => throughput is 5.45E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685411522340) differ by less than 4E-4 (5.3231167029821336e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.309335e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.077737e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7440s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7434s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510689885789416] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9615s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.54E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510689885789416) differ by less than 4E-4 (1.547708909921397e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.681067e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.455476e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.793086e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.723997e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.841062e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.798906e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.394007e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.992718e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 7d9cc4ceb3..d0f21b96dd 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-08_19:51:00 +DATE: 2024-02-05_22:35:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3552s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3070s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4458s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3754s + [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => 
throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2631s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3835s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3139s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7438s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2181s - [COUNTERS] Fortran MEs ( 1 ) : 0.5256s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2916s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5236s + [COUNTERS] Fortran MEs ( 1 ) : 0.7681s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005623] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348915991] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4037s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3344s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0692s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4710s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3945s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005623) differ by less than 2E-4 (7.972267290767832e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348915991) differ by less than 2E-4 (8.658396222216425e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658835] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0503s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7605s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4876s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6294s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658835) differ by less than 2E-4 (2.0059864880295208e-10) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794334) differ by less than 2E-4 (1.967879192932287e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.201729e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.023469e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203533e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.065490e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005628] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2961s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3962s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3571s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005628) differ by less than 2E-4 (7.972245086307339e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658832] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6141s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3595s for 90112 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0102s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5782s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4320s for 90112 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658832) differ by less than 2E-4 (2.0059842675834716e-10) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794337) differ by less than 2E-4 (1.9678814133783362e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.516035e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.046626e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.520975e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.000567e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226530029391] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2991s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2817s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 8192 events => throughput is 4.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3734s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3500s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.51E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226530029391) differ by less than 2E-4 (7.796884249344771e-10) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679756340242] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4227s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2332s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1895s for 90112 events => throughput is 4.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8128s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2526s for 90112 events => throughput is 3.57E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679756340242) differ by less than 2E-4 (9.281064805577444e-11) +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.844894e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.546873e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.853378e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.489419e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3400s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.14E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7728s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5560s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2168s for 90112 events => throughput is 4.16E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.091661e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.136379e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3802s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3498s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0304s for 8192 events => throughput is 2.69E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9114s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5781s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3333s for 90112 events => throughput is 2.70E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.656196e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.642304e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539343558532] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7437s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558532) differ by less than 2E-4 (2.8419933073564607e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9687s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9611s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620649053081024e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.582708e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.992203e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.392597e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.507161e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.373790e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.765106e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.396492e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.772367e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index ace66bb6ed..bfc258d00c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:16:31 +DATE: 2024-02-05_21:03:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.334674e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.108654e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.338540e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 1.109691 sec - 1,247,687,096 cycles:u # 1.072 GHz (74.94%) - 2,278,286 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.54%) - 5,704,503 stalled-cycles-backend:u # 0.46% backend cycles idle (75.11%) - 2,089,277,539 instructions:u # 1.67 insn per cycle - # 0.00 stalled cycles per insn (76.16%) - 1.704153461 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.452350e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.294511e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.148428e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.822610 sec + 2,837,805,632 cycles # 3.005 GHz + 4,403,230,931 instructions # 1.55 insn per cycle + 1.161334861 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.127928e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.288471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.288471e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.367604 sec - 19,491,485,206 cycles:u # 3.046 GHz (75.00%) - 51,794,140 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.00%) - 47,658,866 stalled-cycles-backend:u # 0.24% backend cycles idle (75.00%) - 47,117,293,535 instructions:u # 2.42 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 6.401683865 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.064600e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236923e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.301517 sec + 19,469,535,699 cycles # 3.088 GHz + 46,932,585,474 instructions # 2.41 insn per cycle + 6.315848009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.745857e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.192455e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.192455e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.398930 sec - 13,221,919,735 cycles:u # 2.985 GHz (74.90%) - 52,263,631 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.97%) - 1,044,297,541 stalled-cycles-backend:u # 7.90% backend cycles idle (75.06%) - 31,151,962,295 instructions:u # 2.36 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 4.433514520 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.670155e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.187482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.187482e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.157075 sec + 12,815,071,753 cycles # 3.079 GHz + 31,183,530,054 instructions # 2.43 insn per cycle + 4.175981498 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.408381e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.196444e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.196444e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.397303 sec - 10,167,564,191 cycles:u # 2.966 GHz (74.91%) - 51,874,302 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.00%) - 436,450,232 stalled-cycles-backend:u # 4.29% backend cycles idle (75.03%) - 19,342,752,766 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 3.431966834 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.043386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.866643e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.866643e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.480200 sec + 10,038,614,023 cycles # 2.880 GHz + 19,479,866,151 instructions # 1.94 insn per cycle + 3.497192381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.206412e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.180520e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.180520e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.242937 sec + 9,601,951,480 cycles # 2.956 GHz + 18,942,365,265 instructions # 1.97 insn per cycle + 3.260567136 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.987065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.720633e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.720633e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.565006 sec + 8,159,998,179 cycles # 2.285 GHz + 15,511,778,574 instructions # 1.90 insn per cycle + 3.584683969 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index a02e8efa0d..65910bb431 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:08:08 +DATE: 2024-02-05_21:54:50 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.485774e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.352334e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.352334e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.544822 sec - 18,357,049,736 cycles:u # 3.290 GHz (75.00%) - 121,804,388 stalled-cycles-frontend:u # 0.66% frontend cycles idle (75.00%) - 6,984,637,879 stalled-cycles-backend:u # 38.05% backend cycles idle (74.99%) - 17,123,769,634 instructions:u # 0.93 insn per cycle - # 0.41 stalled cycles per insn (75.06%) - 5.607290014 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.680837e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566605e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566605e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.218628 sec + 7,529,157,154 cycles # 3.046 GHz + 13,238,923,769 instructions # 1.76 insn per cycle + 2.530894761 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.234311e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.409182e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.409182e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.942439 sec - 19,893,415,201 cycles:u # 3.324 GHz (75.01%) - 52,113,264 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.02%) - 124,246,256 stalled-cycles-backend:u # 0.62% backend cycles idle (75.01%) - 47,361,048,369 instructions:u # 2.38 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 5.987785762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.019513e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177027e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177027e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.764382 sec + 20,695,393,450 cycles # 3.057 GHz + 47,159,570,161 instructions # 2.28 insn per cycle + 6.772090237 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.868611e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.336418e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.336418e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.242379 sec - 13,941,244,348 cycles:u # 3.253 GHz (75.00%) - 53,854,319 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.00%) - 983,287,187 stalled-cycles-backend:u # 7.05% backend cycles idle (74.99%) - 31,985,098,389 instructions:u # 2.29 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 4.292601985 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.579917e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.036956e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036956e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.580559 sec + 14,078,384,423 cycles # 3.069 GHz + 32,025,612,143 instructions # 2.27 insn per cycle + 4.588192092 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.533894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.319888e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.319888e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.369509 sec - 10,904,824,724 cycles:u # 3.196 GHz (74.97%) - 51,264,642 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.90%) - 495,017,962 stalled-cycles-backend:u # 4.54% backend cycles idle (74.92%) - 20,691,138,731 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (74.97%) - 3.416560925 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.968228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695210e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695210e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.809898 sec + 11,283,308,654 cycles # 2.957 GHz + 20,842,408,471 instructions # 1.85 insn per cycle + 3.817347229 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.041776e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.843910e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.843910e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.698169 sec + 10,822,932,969 cycles # 2.922 GHz + 20,302,447,935 instructions # 1.88 insn per cycle + 3.705544033 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.801708e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.413060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413060e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.119290 sec + 9,498,000,250 cycles # 2.302 GHz + 16,663,815,127 instructions # 1.75 insn per cycle + 4.126857918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index c001fa8fed..4ae3af74cc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:19:38 +DATE: 2024-02-05_22:08:05 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.205851e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.102555e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.330080e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.480206e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.597414e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.138890e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.665145 sec - 15,379,655,035 cycles:u # 3.280 GHz (74.93%) - 54,006,110 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.85%) - 6,900,380,307 stalled-cycles-backend:u # 44.87% backend cycles idle (74.99%) - 11,488,247,444 instructions:u # 0.75 insn per cycle - # 0.60 stalled 
cycles per insn (75.13%) - 4.716396464 seconds time elapsed +TOTAL : 1.306385 sec + 4,669,803,273 cycles # 3.040 GHz + 7,258,024,375 instructions # 1.55 insn per cycle + 1.593330168 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.249184e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428251e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428251e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.064905e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.237097e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.237097e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.763908 sec - 19,510,803,462 cycles:u # 3.368 GHz (75.01%) - 51,399,710 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.01%) - 63,104,773 stalled-cycles-backend:u # 0.32% backend cycles idle (75.01%) - 47,052,151,639 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 5.796432681 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.649438 sec + 20,601,461,815 cycles # 3.096 GHz + 47,036,177,622 instructions # 2.28 insn per cycle + 6.655735612 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.931734e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.436512e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.436512e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.642258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.154032e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.154032e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.982640 sec - 13,306,793,894 cycles:u # 3.317 GHz (74.89%) - 52,522,331 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.90%) - 1,016,515,877 stalled-cycles-backend:u # 7.64% backend cycles idle (74.99%) - 31,095,573,865 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 4.014845336 seconds time elapsed +TOTAL : 4.581869 sec + 13,918,566,464 cycles # 3.040 GHz + 31,189,830,309 instructions # 2.24 insn per cycle + 4.588050359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.641894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520503e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520503e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.094130e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.941483e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.941483e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.118388 sec - 10,246,999,910 cycles:u # 3.255 GHz (74.84%) - 49,473,514 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.92%) - 443,528,455 stalled-cycles-backend:u # 4.33% backend cycles idle (75.04%) - 19,336,467,898 instructions:u # 1.89 insn per cycle - # 0.02 stalled cycles per insn (75.10%) - 3.150560934 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +TOTAL : 3.749910 sec + 11,151,948,172 cycles # 2.970 GHz + 19,381,078,189 instructions # 1.74 insn per cycle + 3.755953575 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.184182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.148835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148835e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.634201 sec + 10,721,613,182 cycles # 2.946 GHz + 18,644,581,380 instructions # 1.74 insn per cycle + 3.640332735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.014889e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.782541e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.782541e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.873293 sec + 9,338,000,522 cycles # 2.408 GHz + 15,212,575,344 instructions # 1.63 insn per cycle + 3.879430539 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 78a5f84dc3..f143b0d07e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:15:53 +DATE: 2024-02-05_22:01:32 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.447010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.077449e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.305009e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.410601 sec - 17,946,066,362 cycles:u # 3.296 GHz (75.03%) - 120,567,509 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.04%) - 6,888,104,059 stalled-cycles-backend:u # 38.38% backend cycles idle (75.03%) - 16,750,524,596 instructions:u # 0.93 insn per cycle - # 0.41 stalled cycles per insn (75.01%) - 5.466546820 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.185742e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.554312e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.047778e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.839986 sec + 6,325,475,989 cycles # 3.058 GHz + 11,573,800,558 instructions # 1.83 insn per cycle + 2.127438997 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.249236e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428795e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.769202 sec - 19,521,833,039 cycles:u # 3.365 GHz (74.93%) - 50,351,159 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.00%) - 59,959,935 stalled-cycles-backend:u # 0.31% backend cycles idle (75.04%) - 47,022,405,845 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.803333520 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.062149e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.235118e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.235118e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.314728 sec + 19,513,325,065 cycles # 3.088 GHz + 46,932,457,987 instructions # 2.41 insn per cycle + 6.321008534 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.935766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.436965e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.436965e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.980612 sec - 13,283,927,789 cycles:u # 3.311 GHz (74.88%) - 52,410,542 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.90%) - 985,416,242 stalled-cycles-backend:u # 7.42% backend cycles idle (75.00%) - 31,113,123,354 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 4.014719150 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.678056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.200158e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.200158e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.138227 sec + 12,805,005,540 cycles # 3.091 GHz + 31,182,633,145 instructions # 2.44 insn per cycle + 4.144327243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.663450e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.544894e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.544894e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.095489 sec - 10,131,189,853 cycles:u # 3.240 GHz (74.93%) - 48,075,304 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.95%) - 459,616,019 stalled-cycles-backend:u # 4.54% backend cycles idle (74.95%) - 19,408,429,710 instructions:u # 1.92 insn per cycle - # 0.02 stalled cycles per insn (74.93%) - 3.129267263 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.103740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.946386e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.946386e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.380556 sec + 10,029,667,841 cycles # 2.964 GHz + 19,479,253,229 instructions # 1.94 insn per cycle + 3.387379331 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.148197e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.084864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.084864e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.329151 sec + 9,564,427,384 cycles # 2.869 GHz + 18,942,155,298 instructions # 1.98 insn per cycle + 3.335198952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.991549e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.749556e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.749556e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.557586 sec + 8,179,849,242 cycles # 2.297 GHz + 15,511,241,799 instructions # 1.90 insn per cycle + 3.563627106 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 6410590fe2..6a3f1ceed7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:17:02 +DATE: 2024-02-05_21:03:44 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.910052e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.591643e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.913749e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.537799 sec - 1,246,252,190 cycles:u # 2.313 GHz (73.78%) - 2,375,538 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.48%) - 5,063,837 stalled-cycles-backend:u # 0.41% backend cycles idle (75.18%) - 2,025,449,879 instructions:u # 1.63 insn per cycle - # 0.00 stalled cycles per insn (74.67%) - 0.588688986 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.454088e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.310484e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.178306e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.688204 sec + 2,788,161,286 cycles # 3.018 GHz + 4,313,930,533 instructions # 1.55 insn per cycle + 1.004609512 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165206E-002 +Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.189137e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.367378e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.367378e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.063807 sec - 18,518,409,022 cycles:u # 3.040 GHz (74.93%) - 52,837,559 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.93%) - 49,068,178 stalled-cycles-backend:u # 0.26% backend cycles idle (75.00%) - 44,809,533,231 instructions:u # 2.42 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 6.095541198 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.132282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.329017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329017e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.949222 sec + 18,388,946,227 cycles # 3.089 GHz + 44,715,744,739 instructions # 2.43 insn per cycle + 5.964000663 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.815969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.308592e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.308592e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.252278 sec - 12,725,814,930 cycles:u # 2.971 GHz (74.98%) - 52,143,435 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.98%) - 92,946,219 stalled-cycles-backend:u # 0.73% backend cycles idle (74.99%) - 30,141,549,743 instructions:u # 2.37 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 4.287489156 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.733678e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.296638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.296638e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.018712 sec + 12,430,727,525 cycles # 3.089 GHz + 30,107,925,252 instructions # 2.42 insn per cycle + 4.041024093 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.349782e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.096958e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.096958e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.462220 sec - 10,313,532,563 cycles:u # 2.952 GHz (74.88%) - 47,667,709 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.97%) - 297,463,767 stalled-cycles-backend:u # 2.88% backend cycles idle (75.04%) - 19,040,300,630 instructions:u # 1.85 insn per cycle - # 0.02 stalled cycles per insn (75.04%) - 3.497579708 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.056780e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883541e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883541e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.453635 sec + 10,178,670,600 cycles # 2.943 GHz + 19,115,449,328 instructions # 1.88 insn per cycle + 3.483429962 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.258262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271969e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.174579 sec + 9,422,090,303 cycles # 2.963 GHz + 18,488,534,290 instructions # 1.96 insn per cycle + 3.194716512 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.415665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623650e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.995764 sec + 7,229,078,869 cycles # 2.409 GHz + 13,863,533,911 instructions # 1.92 insn per cycle + 3.021066220 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 7bd9ae25ca..ace759a2cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:49:03 +DATE: 2024-02-05_21:43:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.313530e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.103673e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.331803e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.502044 sec - 1,327,766,940 cycles:u # 2.541 GHz (74.23%) - 2,288,038 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.26%) - 5,806,237 stalled-cycles-backend:u # 0.44% backend cycles idle (74.94%) - 2,064,482,988 instructions:u # 1.55 insn per cycle - # 0.00 stalled cycles per insn (75.49%) - 0.552230357 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.475865e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.604505e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.146221e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.701054 sec + 2,730,306,219 cycles # 3.021 GHz + 4,260,051,145 instructions # 1.56 insn per cycle + 0.996046541 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.760335e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.143552e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.143552e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.299727 sec - 14,264,485,997 cycles:u # 3.295 GHz (74.94%) - 51,585,853 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.03%) - 516,456,100 stalled-cycles-backend:u # 3.62% backend cycles idle (75.06%) - 36,766,031,380 instructions:u # 2.58 insn per cycle - # 0.01 stalled cycles per insn (75.06%) - 4.332006754 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.458258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.800445e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.800445e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.703376 sec + 14,587,622,722 cycles # 3.099 GHz + 36,695,900,493 instructions # 2.52 insn per cycle + 4.709688335 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.396616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209079e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209079e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.346787 sec - 10,977,650,608 cycles:u # 3.252 GHz (74.78%) - 52,064,364 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.95%) - 68,767,635 stalled-cycles-backend:u # 0.63% backend cycles idle (75.07%) - 24,719,153,812 instructions:u # 2.25 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 3.380096646 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.110786e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.028102e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.028102e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.372756 sec + 10,352,627,181 cycles # 3.065 GHz + 24,753,157,000 instructions # 2.39 insn per cycle + 3.378997727 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.966819e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124832e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124832e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.849245 sec - 9,188,509,383 cycles:u # 3.192 GHz (75.02%) - 48,362,362 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.99%) - 514,786,293 stalled-cycles-backend:u # 5.60% backend cycles idle (74.99%) - 16,821,404,609 instructions:u # 1.83 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 2.882688928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.404635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.589571e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.589571e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.006552 sec + 8,879,177,800 cycles # 2.948 GHz + 16,954,648,410 instructions # 1.91 insn per cycle + 3.013416619 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED 
] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.558616e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.983086e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983086e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.859667 sec + 8,322,312,903 cycles # 2.905 GHz + 16,297,913,029 instructions # 1.96 insn per cycle + 2.866233363 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.182801e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.123196e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.123196e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.275855 sec + 7,741,160,272 cycles # 2.360 GHz + 14,352,612,145 instructions # 1.85 insn per cycle + 3.282280773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index fce8cfaf0d..462a523e87 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:49:27 +DATE: 2024-02-05_21:44:19 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.942345e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.599595e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.922004e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.486783 sec - 1,253,643,250 cycles:u # 2.466 GHz (75.00%) - 2,246,551 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) - 5,266,359 stalled-cycles-backend:u # 0.42% backend cycles idle (74.91%) - 1,994,045,893 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (74.77%) - 0.536081700 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.483285e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624302e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.203253e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.661912 sec + 2,759,005,929 cycles # 3.039 GHz + 4,269,523,592 instructions # 1.55 insn per cycle + 0.969846459 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165206E-002 +Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.414536e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.195615e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.195615e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.329056 sec - 10,883,365,540 cycles:u # 3.241 GHz (74.99%) - 50,913,166 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) - 65,828,742 stalled-cycles-backend:u # 0.60% backend cycles idle (74.99%) - 28,550,737,411 instructions:u # 2.62 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 3.361278232 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.050399e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.792596e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.792596e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.454738 sec + 10,760,210,593 cycles # 3.110 GHz + 28,354,229,298 instructions # 2.64 insn per cycle + 3.460997731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.612188e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.618351e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.618351e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.131632 sec - 10,129,423,232 cycles:u # 3.205 GHz (74.97%) - 48,255,673 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.97%) - 71,858,708 stalled-cycles-backend:u # 0.71% backend cycles idle (74.97%) - 21,702,694,393 instructions:u # 2.14 insn per cycle - # 0.00 stalled cycles per insn (74.82%) - 3.164603233 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.420212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.670206e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.670206e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.989459 sec + 9,269,380,262 cycles # 3.097 GHz + 21,587,653,666 instructions # 2.33 insn per cycle + 2.995960242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.260862e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.707458e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.707458e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.658223 sec - 8,544,190,465 cycles:u # 3.178 GHz (74.97%) - 48,674,787 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.01%) - 133,578,012 stalled-cycles-backend:u # 1.56% backend cycles idle (75.01%) - 15,840,793,330 instructions:u # 1.85 insn per cycle - # 0.01 stalled cycles per insn (75.01%) - 2.692235669 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.611672e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.040459e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.040459e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.798403 sec + 8,372,237,978 cycles # 2.987 GHz + 15,943,054,462 instructions # 1.90 insn per cycle + 2.804906138 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED 
] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.809188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.542047e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.542047e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.625748 sec + 7,852,939,253 cycles # 2.985 GHz + 15,369,604,507 instructions # 1.96 insn per cycle + 2.631901573 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.255448e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294259e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294259e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.188310 sec + 7,384,984,169 cycles # 2.313 GHz + 13,880,431,877 instructions # 1.88 insn per cycle + 3.194595387 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 5d6584ccba..8a5dca8407 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:17:31 +DATE: 2024-02-05_21:04:17 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.916462e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.225716e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.982310e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.414034 sec - 937,519,241 cycles:u # 2.165 GHz (74.85%) - 2,159,096 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.76%) - 5,666,910 stalled-cycles-backend:u # 0.60% backend cycles idle (75.20%) - 1,814,478,545 instructions:u # 1.94 insn per cycle - # 0.00 stalled cycles per insn (75.92%) - 0.461552135 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.090292e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.087271e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.295454e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.579975 sec + 2,414,557,687 cycles # 3.000 GHz + 3,747,446,339 instructions # 1.55 insn per cycle + 0.879481305 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.274551e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.469484e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.469484e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.640760 sec - 17,255,169,189 cycles:u # 3.045 GHz (74.95%) - 38,908,929 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.02%) - 34,845,885 stalled-cycles-backend:u # 0.20% backend cycles idle (75.02%) - 47,149,536,158 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 5.670509554 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.093315e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.289133e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.289133e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.114541 sec + 18,579,851,378 cycles # 3.036 GHz + 47,045,898,593 instructions # 2.53 insn per cycle + 6.128370923 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.698799e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860879e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860879e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.065307 sec - 9,186,630,006 cycles:u # 2.971 GHz (74.94%) - 40,710,785 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.91%) - 649,259,763 stalled-cycles-backend:u # 7.07% backend cycles idle (74.93%) - 22,235,538,357 instructions:u # 2.42 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 3.096130434 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.229543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.408352e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.408352e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.196404 sec + 9,243,413,602 cycles # 2.887 GHz + 22,093,191,316 instructions # 2.39 insn per cycle + 3.216612535 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.083899e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.517332e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.517332e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.771129 sec - 8,184,044,617 cycles:u # 2.924 GHz (74.90%) - 43,196,009 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.87%) - 1,429,360,940 stalled-cycles-backend:u # 17.47% backend cycles idle (74.87%) - 15,580,671,066 instructions:u # 1.90 insn per cycle - # 0.09 stalled cycles per insn (74.95%) - 2.802871026 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.646884e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.120078e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.120078e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.730676 sec + 8,172,441,197 cycles # 2.987 GHz + 15,624,936,554 instructions # 1.91 insn per cycle + 2.751599284 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.682429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.266485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.266485e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.705183 sec + 7,880,109,299 cycles # 2.907 GHz + 15,297,032,323 instructions # 1.94 insn per cycle + 2.721809360 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.780173e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.417878e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.417878e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.614704 sec + 6,408,894,473 cycles # 2.446 GHz + 12,623,358,022 instructions # 1.97 insn per cycle + 2.635161616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index f07a388ac4..63369f8db6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:08:40 +DATE: 2024-02-05_21:55:27 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.601612e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308011e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.308011e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.356447 sec - 17,736,085,096 cycles:u # 3.294 GHz (74.99%) - 116,600,593 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.99%) - 6,936,520,666 stalled-cycles-backend:u # 39.11% backend cycles idle (75.04%) - 17,072,655,798 instructions:u # 0.96 insn per cycle - # 0.41 stalled cycles per insn (75.05%) - 5.411029879 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.329739e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.607347e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.607347e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.648968 sec + 5,746,677,815 cycles # 3.062 GHz + 10,387,478,908 instructions # 1.81 insn per cycle + 1.933666455 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.413194e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.639057e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.639057e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.183772 sec - 17,450,185,792 cycles:u # 3.345 GHz (75.01%) - 39,608,463 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) - 44,336,860 stalled-cycles-backend:u # 0.25% backend cycles idle (75.00%) - 47,390,057,919 instructions:u # 2.72 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 5.218741499 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.099961e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.290947e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.290947e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.174779 sec + 19,223,896,861 cycles # 3.111 GHz + 47,194,228,256 instructions # 2.45 insn per cycle + 6.182172402 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.873566e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.045756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.045756e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.955526 sec - 9,625,837,302 cycles:u # 3.220 GHz (74.85%) - 41,879,056 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.85%) - 646,421,280 stalled-cycles-backend:u # 6.72% backend cycles idle (74.98%) - 23,590,603,216 instructions:u # 2.45 insn per cycle - # 0.03 stalled cycles per insn (75.11%) - 2.992557705 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.285304e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.422206e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422206e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.222898 sec + 10,014,499,753 cycles # 3.102 GHz + 23,429,401,993 instructions # 2.34 insn per cycle + 3.229900393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.297838e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.817354e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.817354e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.668494 sec - 8,641,490,092 cycles:u # 3.199 GHz (74.83%) - 41,544,998 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.83%) - 1,454,939,462 stalled-cycles-backend:u # 16.84% backend cycles idle (74.96%) - 16,609,731,375 instructions:u # 1.92 insn per cycle - # 0.09 stalled cycles per insn (75.11%) - 2.705371701 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.519465e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.836911e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.836911e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.979965 sec + 8,932,946,167 cycles # 2.992 GHz + 16,752,151,880 instructions # 1.88 insn per cycle + 2.986773953 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.602197e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058006e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.895405 sec + 8,631,584,781 cycles # 2.975 GHz + 16,422,465,858 instructions # 1.90 insn per cycle + 2.902263166 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.626828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049108e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049108e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.871620 sec + 7,175,875,520 cycles # 2.494 GHz + 13,849,689,792 instructions # 1.93 insn per cycle + 2.878604820 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 5ec4d4ab6c..7e45b462eb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:20:09 +DATE: 2024-02-05_22:08:41 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.802337e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.196569e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.957531e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.536180 sec - 15,016,874,212 cycles:u # 3.295 GHz (74.91%) - 53,693,692 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.93%) - 7,009,068,488 stalled-cycles-backend:u # 46.67% backend cycles idle (75.07%) - 11,008,824,349 instructions:u # 0.73 insn per cycle - # 0.64 stalled cycles per insn (75.07%) - 4.580610638 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.307848e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.174799e+09 ) 
sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254654e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.148146 sec + 4,162,615,810 cycles # 3.046 GHz + 6,633,625,649 instructions # 1.59 insn per cycle + 1.425956052 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.424112e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.654453e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.654453e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.110958e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.307650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.307650e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.087804 sec - 17,240,457,051 cycles:u # 3.371 GHz (74.97%) - 39,475,809 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) - 39,235,588 stalled-cycles-backend:u # 0.23% backend cycles idle (74.97%) - 47,225,654,656 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 5.116304782 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.341153 sec + 19,568,239,445 cycles # 3.084 GHz + 47,229,460,572 instructions # 2.41 insn per cycle + 6.347007883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.921746e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147753e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.147753e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.331196e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.570674e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.570674e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.830451 sec - 9,319,060,198 cycles:u # 3.261 GHz (74.93%) - 41,069,989 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.09%) - 645,303,255 stalled-cycles-backend:u # 6.92% backend cycles idle (75.09%) - 22,133,332,086 instructions:u # 2.38 insn per cycle - # 0.03 stalled cycles per insn (75.09%) - 2.859939720 seconds time elapsed +TOTAL : 3.377654 sec + 10,245,146,540 cycles # 3.029 GHz + 22,173,719,356 instructions # 2.16 insn per cycle + 3.383537599 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.415975e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.004327e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.004327e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.607110e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069423e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069423e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.511277 sec - 8,232,656,213 cycles:u # 3.245 GHz (74.82%) - 42,044,882 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.95%) - 1,462,340,926 stalled-cycles-backend:u # 17.76% backend cycles idle (75.09%) - 15,504,031,361 instructions:u # 1.88 insn per cycle - # 0.09 stalled cycles per insn (75.09%) - 2.539529796 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +TOTAL : 3.090252 sec + 9,206,173,868 cycles # 2.975 GHz + 15,535,610,413 instructions # 1.69 insn per cycle + 3.096199232 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.726874e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.384844e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.384844e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.983648 sec + 8,939,068,101 cycles # 2.992 GHz + 15,006,420,771 instructions # 1.68 insn per cycle + 2.989393384 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.761883e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.389409e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.389409e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.961475 sec + 7,435,706,316 cycles # 2.509 GHz + 12,333,144,638 instructions # 1.66 insn per cycle + 2.967308412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 084018ae7a..eccc4446a6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_19:16:25 +DATE: 2024-02-05_22:02:07 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.352208e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.992270e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.700820e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.295666 sec - 17,656,305,360 cycles:u # 3.317 GHz (74.94%) - 117,895,335 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.91%) - 6,965,456,924 stalled-cycles-backend:u # 39.45% backend cycles idle (74.95%) - 16,720,237,335 instructions:u # 0.95 insn per cycle - # 0.42 stalled cycles per insn (75.04%) - 5.346362564 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.242306e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165230e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.195778e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.444358 sec + 5,084,597,164 cycles # 3.054 GHz + 9,275,960,840 instructions # 1.82 insn per cycle + 1.722136335 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.425932e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.655535e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.655535e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.086483 sec - 17,217,885,511 cycles:u # 3.367 GHz (74.97%) - 39,329,051 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) - 38,892,111 stalled-cycles-backend:u # 0.23% backend cycles idle (74.98%) - 47,250,198,154 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 5.115965831 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.115036e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.313739e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.313739e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.993942 sec + 18,570,897,233 cycles # 3.097 GHz + 47,045,577,349 instructions # 2.53 insn per cycle + 5.999677453 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.929891e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.150810e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.150810e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.824875 sec - 9,281,749,018 cycles:u # 3.255 GHz (75.00%) - 40,736,236 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.04%) - 617,474,929 stalled-cycles-backend:u # 6.65% backend cycles idle (75.04%) - 22,147,277,727 instructions:u # 2.39 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 2.854201201 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.400769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.663094e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.663094e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.965370 sec + 9,232,101,571 cycles # 3.108 GHz + 22,091,285,431 instructions # 2.39 insn per cycle + 2.971262386 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.420886e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.010214e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.010214e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.515060 sec - 8,209,191,572 cycles:u # 3.230 GHz (74.85%) - 42,338,926 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.85%) - 1,453,034,382 stalled-cycles-backend:u # 17.70% backend cycles idle (74.89%) - 15,532,414,795 instructions:u # 1.89 insn per cycle - # 0.09 stalled cycles per insn (75.05%) - 2.544513289 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.574500e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.992207e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.992207e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.803871 sec + 8,156,775,877 cycles # 2.904 GHz + 15,624,194,262 instructions # 1.92 insn per cycle + 2.809755883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.764006e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.416109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.416109e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.625956 sec + 7,852,454,396 cycles # 2.985 GHz + 15,295,794,447 instructions # 1.95 insn per cycle + 2.631853276 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.751259e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.355715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.355715e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.642188 sec + 6,396,687,295 cycles # 2.417 GHz + 12,623,285,928 instructions # 1.97 insn per cycle + 2.647901833 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 7370ecd908..e0c72c5e2b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:17:57 +DATE: 2024-02-05_21:04:47 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.914450e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.243059e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.013320e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.409693 sec - 939,976,117 cycles:u # 2.194 GHz (73.83%) - 2,210,035 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.72%) - 5,004,797 stalled-cycles-backend:u # 0.53% backend cycles idle (75.80%) - 1,809,251,298 instructions:u # 1.92 insn per cycle - # 0.00 stalled cycles per insn (75.67%) - 0.456092705 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.091038e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.089941e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319291e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.579294 sec + 2,335,364,905 cycles # 2.891 GHz + 3,618,466,633 instructions # 1.55 insn per cycle + 0.885152629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.393713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.641850e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.641850e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.245299 sec - 15,999,006,907 cycles:u # 3.034 GHz (74.97%) - 40,433,716 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.97%) - 28,298,756 stalled-cycles-backend:u # 0.18% backend cycles idle (74.98%) - 44,060,621,572 instructions:u # 2.75 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 5.275168966 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.170365e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.389464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.389464e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.728515 sec + 17,727,054,378 cycles # 3.093 GHz + 43,885,619,946 instructions # 2.48 insn per cycle + 5.742377896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.750362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.958887e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.958887e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.024286 sec - 8,985,652,165 cycles:u # 2.945 GHz (74.88%) - 41,957,731 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.85%) - 136,530,468 stalled-cycles-backend:u # 1.52% backend cycles idle (74.94%) - 21,748,114,495 instructions:u # 2.42 insn per cycle - # 0.01 stalled cycles per insn (75.07%) - 3.055192320 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.388452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.722234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.722234e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.990798 sec + 9,083,778,420 cycles # 3.032 GHz + 21,582,115,658 instructions # 2.38 insn per cycle + 3.009778662 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.132254e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.620812e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.620812e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.740350 sec - 8,097,875,361 cycles:u # 2.925 GHz (74.88%) - 43,314,820 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.88%) - 1,759,136,170 stalled-cycles-backend:u # 21.72% backend cycles idle (74.94%) - 15,336,449,761 instructions:u # 1.89 insn per cycle - # 0.11 stalled cycles per insn (75.09%) - 2.772292539 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.653242e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.141586e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.141586e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.723711 sec + 8,113,273,564 cycles # 2.973 GHz + 15,429,717,708 instructions # 1.90 insn per cycle + 2.745601764 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.767282e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.438840e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.438840e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.623612 sec + 7,842,213,849 cycles # 2.983 GHz + 15,086,650,215 instructions # 1.92 insn per cycle + 2.642651290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.853427e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.667539e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.667539e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.566015 sec + 6,180,775,263 cycles # 2.404 GHz + 12,245,115,281 instructions # 1.98 insn per cycle + 2.584349048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 892dd28020..2ca4079866 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:49:54 +DATE: 2024-02-05_21:44:47 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.916931e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.213424e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.976795e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.380532 sec - 928,460,266 cycles:u # 2.336 GHz (74.61%) - 2,128,809 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.93%) - 5,749,211 stalled-cycles-backend:u # 0.62% backend cycles idle (75.91%) - 1,808,802,965 instructions:u # 1.95 insn per cycle - # 0.00 stalled cycles per insn (75.99%) - 0.426363917 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.294311e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.188610e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298765e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.558468 sec + 2,361,222,111 cycles # 3.020 GHz + 3,664,682,465 instructions # 1.55 insn per cycle + 0.839090549 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.897857e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.340213e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.340213e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.989075 sec - 13,209,705,815 cycles:u # 3.290 GHz (74.90%) - 39,659,904 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.91%) - 1,255,460,951 stalled-cycles-backend:u # 9.50% backend cycles idle (74.92%) - 38,009,003,114 instructions:u # 2.88 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 4.018523538 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.515808e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.899432e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.899432e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.497522 sec + 13,733,531,110 cycles # 3.050 GHz + 37,847,942,431 instructions # 2.76 insn per cycle + 4.503720248 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039543819614E-002 -Relative difference = 3.5561191488957804e-08 +Avg ME (F77/C++) = 1.2828039414671366E-002 +Relative difference = 4.562884388571957e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.447145e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.299888e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.299888e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.499550 sec - 8,065,061,748 cycles:u # 3.192 GHz (74.99%) - 41,243,845 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.99%) - 240,300,906 stalled-cycles-backend:u # 2.98% backend cycles idle (74.99%) - 18,644,458,983 instructions:u # 2.31 insn per cycle - # 0.01 stalled cycles per insn (75.01%) - 2.530724959 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.870371e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.896075e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.896075e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.543054 sec + 7,908,771,565 cycles # 3.104 GHz + 18,602,738,348 instructions # 2.35 insn per cycle + 2.549080007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.820494e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.969123e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.969123e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.323103 sec - 7,440,529,285 cycles:u # 3.167 GHz (74.86%) - 41,648,732 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.00%) - 1,090,351,915 stalled-cycles-backend:u # 14.65% backend cycles idle (74.98%) - 14,376,530,797 instructions:u # 1.93 insn per cycle - # 0.08 stalled cycles per insn (74.98%) - 2.353403381 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.917953e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.876983e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.876983e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.514474 sec + 7,422,980,759 cycles # 2.947 GHz + 14,339,966,909 instructions # 1.93 insn per cycle + 2.520873084 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053246266791E-002 +Relative difference = 2.5306003563303186e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.019478e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.132034e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.132034e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.432856 sec + 7,272,187,783 cycles # 2.983 GHz + 13,954,061,773 instructions # 1.92 insn per cycle + 2.439104287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053337216261E-002 -Relative difference = 2.601499261602198e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.695630e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.321192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.321192e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.703955 sec + 6,276,486,435 cycles # 2.319 GHz + 13,210,471,023 instructions # 2.10 insn per cycle + 2.710431861 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052540498902E-002 +Relative difference = 1.980424851420537e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index fd4bcefacd..b4c719017f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:50:16 +DATE: 2024-02-05_21:45:14 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.914948e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.243896e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.010655e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.380778 sec - 968,057,402 cycles:u # 2.420 GHz (74.18%) - 2,210,095 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.00%) - 4,956,770 stalled-cycles-backend:u # 0.51% backend cycles idle (74.85%) - 1,792,765,521 instructions:u # 1.85 insn per cycle - # 0.00 stalled cycles per insn (76.46%) - 0.426408994 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.298701e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 
1.198119e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328440e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.562549 sec + 2,357,901,564 cycles # 3.003 GHz + 3,618,401,654 instructions # 1.53 insn per cycle + 0.844509832 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.648148e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582942e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582942e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.048682 sec - 9,955,006,358 cycles:u # 3.238 GHz (75.02%) - 39,164,420 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) - 30,308,380 stalled-cycles-backend:u # 0.30% backend cycles idle (75.03%) - 28,534,668,727 instructions:u # 2.87 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.077460057 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.119202e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.962243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.318683 sec + 10,107,573,477 cycles # 3.041 GHz + 28,399,137,457 instructions # 2.81 insn per cycle + 3.324720314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.799299e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.213042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.213042e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.329377 sec - 7,459,821,031 cycles:u # 3.166 GHz (74.88%) - 37,456,115 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.90%) - 38,987,114 stalled-cycles-backend:u # 0.52% backend cycles idle (74.90%) - 16,973,419,713 instructions:u # 2.28 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 2.360322814 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.147008e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.786097e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.786097e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.346802 sec + 7,263,828,292 cycles # 3.089 GHz + 16,785,829,907 instructions # 2.31 insn per cycle + 2.352943440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.006656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.441300e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.441300e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.241524 sec - 7,193,817,128 cycles:u # 3.172 GHz (75.00%) - 42,485,934 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%) - 351,808,028 stalled-cycles-backend:u # 4.89% backend cycles idle (74.96%) - 13,627,130,798 instructions:u # 1.89 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 2.271606398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.123540e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.405304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405304e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.369038 sec + 7,067,469,253 cycles # 2.977 GHz + 13,728,904,699 instructions # 1.94 insn per cycle + 2.375173059 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.164074e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.525165e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.525165e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.342246 sec + 7,005,059,344 cycles # 2.987 GHz + 13,461,252,232 instructions # 1.92 insn per cycle + 2.348361461 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053331759293E-002 -Relative difference = 2.597245327285885e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.031554e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.093489e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.093489e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.432548 sec + 6,052,738,178 cycles # 2.484 GHz + 12,911,325,567 instructions # 2.13 insn per cycle + 2.438724916 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index c82d6365dd..be425be4c7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:18:23 +DATE: 2024-02-05_21:05:17 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.317919e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.117489e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.346996e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.535140 sec - 1,268,315,502 cycles:u # 2.268 GHz (75.02%) - 2,382,364 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.52%) - 5,765,171 stalled-cycles-backend:u # 0.45% backend cycles idle (74.15%) - 2,159,570,463 instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (74.51%) - 0.589109914 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.442335e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.270525e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.142227e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.694348 sec + 2,778,759,265 cycles # 2.987 GHz + 4,351,769,973 instructions # 1.57 insn per cycle + 1.013538948 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590281E-002 -Relative difference = 7.67145406542181e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.120301e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.279766e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279766e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.407405 sec - 19,623,458,416 cycles:u # 3.048 GHz (74.97%) - 54,753,699 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.03%) - 154,428,151 stalled-cycles-backend:u # 0.79% backend cycles idle (75.03%) - 47,024,297,157 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 6.441470641 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.053043e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.221133e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221133e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.363891 sec + 19,667,437,924 cycles # 3.088 GHz + 46,970,593,915 instructions # 2.39 insn per cycle + 6.379400508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.805955e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283656e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283656e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.269630 sec - 12,905,192,325 cycles:u # 3.000 GHz (74.93%) - 51,265,561 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.90%) - 2,181,477,361 stalled-cycles-backend:u # 16.90% backend cycles idle (74.93%) - 30,970,710,391 instructions:u # 2.40 insn per cycle - # 0.07 stalled cycles per insn (75.03%) - 4.306090956 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.674094e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.206402e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206402e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.154494 sec + 12,528,254,523 cycles # 3.011 GHz + 30,922,253,447 instructions # 2.47 insn per cycle + 4.173218933 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.504584e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.300428e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.300428e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.256812 sec - 10,354,666,001 cycles:u # 3.150 GHz (74.94%) - 48,200,633 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.96%) - 901,434,633 stalled-cycles-backend:u # 8.71% backend cycles idle (74.95%) - 19,378,250,348 instructions:u # 1.87 insn per cycle - # 0.05 stalled cycles per insn (74.94%) - 3.291563108 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.054042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.864807e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.864807e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.453659 sec + 10,215,405,512 cycles # 2.953 GHz + 19,546,791,186 instructions # 1.91 insn per cycle + 3.473149540 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED 
] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.158084e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.088743e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.088743e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.309666 sec + 9,731,831,976 cycles # 2.936 GHz + 18,859,355,725 instructions # 1.94 insn per cycle + 3.328443471 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.025709e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.816813e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.816813e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.498711 sec + 8,174,773,168 cycles # 2.333 GHz + 14,813,296,851 instructions # 1.81 insn per cycle + 3.518454852 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 2549edf04b..016b4dcfb3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-08_18:18:53 +DATE: 2024-02-05_21:05:51 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.901466e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.593663e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.920117e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.523632 sec - 1,231,852,545 cycles:u # 2.262 GHz (73.87%) - 2,296,434 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.76%) - 5,025,010 stalled-cycles-backend:u # 0.41% backend cycles idle (75.17%) - 2,041,125,250 instructions:u # 1.66 insn per cycle - # 0.00 stalled cycles per insn (75.36%) - 0.574251304 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.442557e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.276977e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 7.115971e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.691949 sec + 2,792,230,252 cycles # 3.012 GHz + 4,406,954,821 instructions # 1.58 insn per cycle + 1.006155048 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590284E-002 -Relative difference = 7.67145379496374e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.159622e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.334824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.334824e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.233625 sec - 18,579,167,948 cycles:u # 2.966 GHz (74.97%) - 50,731,815 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.97%) - 58,751,802 stalled-cycles-backend:u # 0.32% backend cycles idle (74.98%) - 44,764,886,897 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 6.266696472 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.115199e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.308123e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.308123e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.039453 sec + 18,520,409,160 cycles # 3.065 GHz + 44,592,637,919 instructions # 2.41 insn per cycle + 6.051596986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.799612e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.291488e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.291488e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.306201 sec - 12,648,742,085 cycles:u # 2.916 GHz (74.87%) - 50,507,268 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.84%) - 1,829,208,873 stalled-cycles-backend:u # 14.46% backend cycles idle (74.94%) - 30,286,759,013 instructions:u # 2.39 insn per cycle - # 0.06 stalled cycles per insn (75.03%) - 4.341760557 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.733167e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.307863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307863e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.027291 sec + 12,198,841,388 cycles # 3.025 GHz + 30,216,598,772 instructions # 2.48 insn per cycle + 4.046858847 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.318823e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.068967e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.068967e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.519779 sec - 10,213,564,621 cycles:u # 2.876 GHz (75.00%) - 56,572,968 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.00%) - 289,161,532 stalled-cycles-backend:u # 2.83% backend cycles idle (75.00%) - 18,774,056,028 instructions:u # 1.84 insn per cycle - # 0.02 stalled cycles per insn (75.02%) - 3.555444739 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.049138e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.860594e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.860594e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.461039 sec + 10,275,869,327 cycles # 2.964 GHz + 19,037,482,995 instructions # 1.85 insn per cycle + 3.487028160 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED 
] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.220471e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.207666e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.207666e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.222554 sec + 9,576,129,417 cycles # 2.966 GHz + 18,451,864,640 instructions # 1.93 insn per cycle + 3.247543229 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.403087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.559501e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.559501e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.009542 sec + 7,207,687,295 cycles # 2.391 GHz + 13,242,362,983 instructions # 1.84 insn per cycle + 3.028852409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index e419a8a864..5df3cfc728 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:19:22 +DATE: 2024-02-05_21:06:24 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.337007e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.020690e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.463140 sec - 876,830,409 cycles:u # 2.041 GHz (74.46%) - 2,297,043 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.39%) - 4,959,442 stalled-cycles-backend:u # 0.57% backend cycles idle (75.77%) - 1,390,914,236 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (76.13%) - 0.518184364 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.031380e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.141335e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278634e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.533981 sec + 2,292,597,871 cycles # 2.985 GHz + 3,192,077,403 instructions # 1.39 insn per cycle + 0.845567304 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.224994e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.282415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.282415e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.909539 sec - 14,979,399,811 cycles:u # 3.032 GHz (74.92%) - 9,752,610 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.90%) - 738,781,241 stalled-cycles-backend:u # 4.93% backend cycles idle (74.95%) - 38,733,604,171 instructions:u # 2.59 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 4.943604173 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.167153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.231469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.231469e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.944054 sec + 14,988,837,417 cycles # 3.029 GHz + 38,722,101,845 instructions # 2.58 insn per cycle + 4.959702244 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.991944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.192633e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.192633e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.822701 sec - 8,530,609,378 cycles:u # 2.989 GHz (74.82%) - 8,934,860 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.91%) - 200,257,724 stalled-cycles-backend:u # 2.35% backend cycles idle (75.05%) - 24,381,384,726 instructions:u # 2.86 insn per cycle - # 0.01 stalled cycles per insn (75.05%) - 2.857525850 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.673555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.879473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879473e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.960626 sec + 8,958,008,294 cycles # 3.020 GHz + 24,429,572,303 instructions # 2.73 insn per cycle + 2.978621858 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.771764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.289617e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.289617e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.745992 sec - 5,163,847,173 cycles:u # 2.904 GHz (74.69%) - 7,925,713 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.62%) - 1,076,944,521 stalled-cycles-backend:u # 20.86% backend cycles idle (74.82%) - 11,575,904,777 instructions:u # 2.24 insn per cycle - # 0.09 stalled cycles per insn (75.04%) - 1.782036752 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.882622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.400822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.400822e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.887751 sec + 5,543,029,143 cycles # 2.927 GHz + 11,561,920,461 instructions # 2.09 insn per cycle + 1.907306147 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.804787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.518217e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.518217e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.646661 sec + 4,830,901,508 cycles # 2.924 GHz + 10,339,492,038 instructions # 2.14 insn per cycle + 1.663574159 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.510361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.803489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.803489e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.431325 sec + 4,950,210,545 cycles # 2.032 GHz + 7,554,605,170 instructions # 1.53 insn per cycle + 2.448514603 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index e68c61298a..3b9c251b66 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:09:09 +DATE: 2024-02-05_21:56:00 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.947613e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.787187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.787187e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.237626 sec - 3,741,397,276 cycles:u # 2.935 GHz (74.96%) - 21,950,070 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.16%) - 1,158,796,162 stalled-cycles-backend:u # 30.97% backend cycles idle (75.16%) - 3,864,054,831 instructions:u # 1.03 insn per cycle - # 0.30 stalled cycles per insn (74.88%) - 1.302713629 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.584968e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.894866e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.894866e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.799738 sec + 3,165,879,197 cycles # 3.015 GHz + 4,878,208,543 instructions # 1.54 insn per cycle + 1.107477855 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.520852e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.586471e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.421469 sec - 15,062,320,690 cycles:u # 3.374 GHz (74.92%) - 10,443,532 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) - 743,646,927 stalled-cycles-backend:u # 4.94% backend cycles idle (74.94%) - 38,749,016,389 instructions:u # 2.57 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 4.467786878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.203709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.271501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.271501e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.942943 sec + 15,313,610,911 cycles # 3.094 GHz + 38,782,458,170 instructions # 2.53 insn per cycle + 4.950650950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.490354e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.717831e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.717831e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.604527 sec - 8,669,634,857 cycles:u # 3.274 GHz (74.93%) - 9,990,683 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.93%) - 213,001,071 stalled-cycles-backend:u # 2.46% backend cycles idle (74.94%) - 24,550,642,471 instructions:u # 2.83 insn per cycle - # 0.01 stalled cycles per insn (75.09%) - 2.652317282 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.720233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.927980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.927980e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.000337 sec + 9,301,358,484 cycles # 3.094 GHz + 24,611,951,458 instructions # 2.65 insn per cycle + 3.007950946 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.596512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.175899e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.175899e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.652537 sec - 5,319,982,348 cycles:u # 3.135 GHz (75.05%) - 9,374,611 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%) - 1,081,454,167 stalled-cycles-backend:u # 20.33% backend cycles idle (75.02%) - 11,815,370,610 instructions:u # 2.22 insn per cycle - # 0.09 stalled cycles per insn (75.02%) - 1.701362888 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.770866e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.269781e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.269781e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.001192 sec + 5,886,505,539 cycles # 2.932 GHz + 11,848,414,311 instructions # 2.01 insn per cycle + 2.008838193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.354252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.965861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.965861e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.834602 sec + 5,170,533,445 cycles # 2.808 GHz + 10,625,432,779 instructions # 2.05 insn per cycle + 1.842105594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.444731e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731238e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731238e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.545615 sec + 5,312,567,215 cycles # 2.082 GHz + 7,798,505,404 instructions # 1.47 insn per cycle + 2.553150085 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 3f33133cca..e26aca600d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:20:37 +DATE: 2024-02-05_22:09:13 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.537277e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962263e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.016665e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.584208e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160059e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.278576e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.071372 sec - 3,196,294,827 cycles:u # 2.913 GHz (75.22%) - 10,622,560 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.92%) - 1,138,534,793 stalled-cycles-backend:u # 35.62% backend cycles idle (74.92%) - 2,985,973,404 instructions:u # 0.93 insn per cycle - # 0.38 stalled cycles per insn (75.18%) - 1.123855307 
seconds time elapsed +TOTAL : 0.609246 sec + 2,581,101,746 cycles # 3.036 GHz + 3,713,117,006 instructions # 1.44 insn per cycle + 0.907852678 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.526171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.593711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.593711e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.227475e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.294111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294111e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.333890 sec - 14,966,789,355 cycles:u # 3.429 GHz (74.91%) - 9,417,395 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) - 819,315,354 stalled-cycles-backend:u # 5.47% backend cycles idle (74.96%) - 38,694,093,641 instructions:u # 2.59 insn per cycle - # 0.02 stalled cycles per insn (75.06%) - 4.368016125 seconds time elapsed +TOTAL : 4.874182 sec + 15,158,379,002 cycles # 3.108 GHz + 38,738,551,190 instructions # 2.56 insn per cycle + 4.880819504 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.520390e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747995e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747995e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.767694e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.983118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.983118e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.498026 sec - 8,520,170,360 cycles:u # 3.369 GHz (75.02%) - 9,607,172 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%) - 200,455,121 stalled-cycles-backend:u # 2.35% backend cycles idle (75.01%) - 24,437,311,730 instructions:u # 2.87 insn per cycle - # 0.01 stalled cycles per insn (74.88%) - 2.531681558 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.945749 sec + 9,142,701,773 cycles # 3.098 GHz + 24,427,627,687 instructions # 2.67 insn per cycle + 2.951884209 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.690089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.288935e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.288935e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.805965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338850e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338850e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.547010 sec - 5,201,654,467 cycles:u # 3.295 GHz (74.68%) - 9,324,520 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.79%) - 1,064,918,671 stalled-cycles-backend:u # 20.47% backend cycles idle (75.04%) - 11,505,267,087 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.17%) - 1.580873032 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +TOTAL : 1.974104 sec + 5,727,022,204 cycles # 2.896 GHz + 11,544,923,636 instructions # 2.02 insn per cycle + 1.980353692 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.756149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.446891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.446891e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.717912 sec + 5,004,707,628 cycles # 2.905 GHz + 10,287,500,091 instructions # 2.06 insn per cycle + 1.724204003 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.529833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.832701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.832701e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.483819 sec + 5,134,216,348 cycles # 2.063 GHz + 7,502,488,127 instructions # 1.46 insn per cycle + 2.490239423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 8621d1f638..fc219074a0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:16:53 +DATE: 2024-02-05_22:02:38 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.809222e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.964400e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018784e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.180941 sec - 3,644,641,887 cycles:u # 2.998 GHz (75.01%) - 21,509,076 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) - 1,148,002,124 stalled-cycles-backend:u # 31.50% backend cycles idle (74.98%) - 3,857,731,060 instructions:u # 1.06 insn per cycle - # 0.30 stalled cycles per insn (74.96%) - 1.237021055 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.974189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155593e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271531e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.695991 sec + 2,799,036,357 cycles # 3.024 GHz + 4,374,236,566 instructions # 1.56 insn per cycle + 0.983348330 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.502070e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.566557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.566557e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.371978 sec - 15,091,805,960 cycles:u # 3.426 GHz (74.95%) - 10,010,529 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) - 814,210,100 stalled-cycles-backend:u # 5.40% backend cycles idle (74.94%) - 38,744,802,078 instructions:u # 2.57 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 4.407218033 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.222419e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.288906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.288906e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.824134 sec + 14,966,646,991 cycles # 3.100 GHz + 38,721,963,975 instructions # 2.59 insn per cycle + 4.830722708 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.523918e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.755564e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.755564e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.499500 sec - 8,521,864,904 cycles:u # 3.366 GHz (74.95%) - 10,288,038 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.04%) - 199,650,176 stalled-cycles-backend:u # 2.34% backend cycles idle (75.04%) - 24,356,495,689 instructions:u # 2.86 insn per cycle - # 0.01 stalled cycles per insn (75.05%) - 2.534569681 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779967e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.992813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.992813e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.878539 sec + 8,952,214,854 cycles # 3.104 GHz + 24,428,365,936 instructions # 2.73 insn per cycle + 2.884761942 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.693357e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.287111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.287111e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.545162 sec - 5,188,095,469 cycles:u # 3.288 GHz (74.70%) - 9,172,935 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.94%) - 1,065,100,619 stalled-cycles-backend:u # 20.53% backend cycles idle (75.16%) - 11,463,135,587 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.17%) - 1.580365960 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.891897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.405679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.405679e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.885534 sec + 5,523,928,525 cycles # 2.924 GHz + 11,561,426,307 instructions # 2.09 insn per cycle + 1.891796100 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.812282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.507614e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.507614e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.645132 sec + 4,825,784,737 cycles # 2.924 GHz + 10,338,194,804 instructions # 2.14 insn per cycle + 1.651332428 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.459424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.748576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.748576e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.457548 sec + 4,961,918,439 cycles # 2.015 GHz + 7,553,551,302 instructions # 1.52 insn per cycle + 2.463770367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index c811de5803..64a6ffae37 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:19:47 +DATE: 2024-02-05_21:06:51 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.864130e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.922591e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.975815e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.403916 sec - 868,936,187 cycles:u # 2.036 GHz (73.90%) - 2,274,927 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.40%) - 5,131,368 stalled-cycles-backend:u # 0.59% backend cycles idle (75.66%) - 1,388,595,911 instructions:u # 1.60 insn per cycle - # 0.00 stalled cycles per insn (75.82%) - 0.457989081 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.024315e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.137212e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273722e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.527459 sec + 2,303,911,509 cycles # 3.021 GHz + 3,282,093,306 instructions # 1.42 insn per cycle + 0.841396629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.161836e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.216439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.216439e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.046159 sec - 15,376,865,932 cycles:u # 3.029 GHz (74.95%) - 9,473,842 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) - 20,032,982 stalled-cycles-backend:u # 0.13% backend cycles idle (74.96%) - 39,614,495,316 instructions:u # 2.58 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 5.081436215 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.269898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.339915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.339915e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.724475 sec + 14,695,477,691 cycles # 3.107 GHz + 39,544,600,287 instructions # 2.69 insn per cycle + 4.738338098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.948082e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.150404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.150404e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.858126 sec - 8,747,707,706 cycles:u # 3.025 GHz (74.86%) - 10,198,563 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.83%) - 1,205,896,793 stalled-cycles-backend:u # 13.79% backend cycles idle (74.94%) - 23,497,904,722 instructions:u # 2.69 insn per cycle - # 0.05 stalled cycles per insn (75.08%) - 2.938845264 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.819317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.044125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.044125e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.855662 sec + 8,611,495,269 cycles # 3.013 GHz + 23,577,603,947 instructions # 2.74 insn per cycle + 2.876870171 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.218452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.688137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.688137e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.882617 sec - 5,712,364,152 cycles:u # 2.984 GHz (74.98%) - 8,361,013 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.93%) - 1,034,558,412 stalled-cycles-backend:u # 18.11% backend cycles idle (74.93%) - 13,212,879,207 instructions:u # 2.31 insn per cycle - # 0.08 stalled cycles per insn (74.96%) - 1.918038279 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.386792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.829860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.829860e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.054853 sec + 5,976,472,878 cycles # 2.900 GHz + 13,192,591,343 instructions # 2.21 insn per cycle + 2.073701205 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.863746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.385624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.385624e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.894056 sec + 5,556,974,582 cycles # 2.925 GHz + 12,102,123,600 instructions # 2.18 insn per cycle + 1.916239054 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.164130e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.414147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.414147e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.622660 sec + 5,367,738,607 cycles # 2.042 GHz + 9,380,734,862 instructions # 1.75 insn per cycle + 2.639729471 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 6e93c2f99f..7e05113160 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:50:37 +DATE: 2024-02-05_21:45:40 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.887107e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.964739e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018898e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.370653 sec - 906,243,961 cycles:u # 2.307 GHz (74.68%) - 2,241,400 stalled-cycles-frontend:u # 0.25% frontend cycles idle (76.15%) - 5,816,559 stalled-cycles-backend:u # 0.64% backend cycles idle (75.71%) - 1,412,893,858 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (75.77%) - 0.419063355 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.570294e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157680e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272948e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.517984 sec + 2,258,663,150 cycles # 3.002 GHz + 3,251,980,093 instructions # 1.44 insn per cycle + 0.809803833 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.820858e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.901952e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.901952e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.893828 sec - 13,216,640,916 cycles:u # 3.368 GHz (74.93%) - 8,991,237 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) - 550,737,421 stalled-cycles-backend:u # 4.17% backend cycles idle (74.93%) - 35,871,958,786 instructions:u # 2.71 insn per cycle - # 0.02 stalled cycles per insn (74.95%) - 3.926487908 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.389882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.467416e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.467416e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.492234 sec + 13,897,256,384 cycles # 3.090 GHz + 35,848,363,284 instructions # 2.58 insn per cycle + 4.498607059 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.376057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.595663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.595663e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.573148 sec - 8,750,160,649 cycles:u # 3.361 GHz (74.86%) - 10,672,247 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.81%) - 2,345,651,364 stalled-cycles-backend:u # 26.81% backend cycles idle (74.95%) - 21,833,904,041 instructions:u # 2.50 insn per cycle - # 0.11 stalled cycles per insn (75.10%) - 2.606979230 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.126893e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381970e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381970e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.644860 sec + 8,205,375,516 cycles # 3.096 GHz + 21,906,179,334 instructions # 2.67 insn per cycle + 2.651401622 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.596704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.037393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.037393e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.766710 sec - 5,911,286,240 cycles:u # 3.289 GHz (74.98%) - 8,780,066 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.08%) - 2,229,927,098 stalled-cycles-backend:u # 37.72% backend cycles idle (75.08%) - 12,032,978,967 instructions:u # 2.04 insn per cycle - # 0.19 stalled cycles per insn (75.08%) - 1.800991140 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.864313e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.381523e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.894123 sec + 5,543,467,549 cycles # 2.919 GHz + 12,075,161,866 instructions # 2.18 insn per cycle + 1.900622314 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.357489e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.970171e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.970171e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.754895 sec + 5,133,966,519 cycles # 2.917 GHz + 11,142,120,432 instructions # 2.17 insn per cycle + 1.761680532 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.405916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.704474e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.704474e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.490028 sec + 4,802,416,326 cycles # 1.924 GHz + 8,842,419,328 instructions # 1.84 insn per cycle + 2.496721986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index e2182ea92b..4e18a74fd4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:50:59 +DATE: 2024-02-05_21:46:07 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.857507e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.934612e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.988214e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.368713 sec - 871,490,127 cycles:u # 2.227 GHz (73.86%) - 2,160,498 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.35%) - 5,083,056 stalled-cycles-backend:u # 0.58% backend cycles idle (75.46%) - 1,381,538,673 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (75.52%) - 0.419240239 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.571168e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.164253e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.280238e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.519386 sec + 2,267,911,515 cycles # 3.016 GHz + 3,251,447,706 instructions # 1.43 insn per cycle + 0.810353004 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.190272e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.296909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.296909e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.463551 sec - 11,796,590,494 cycles:u # 3.376 GHz (74.88%) - 9,800,385 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%) - 23,034,021 stalled-cycles-backend:u # 0.20% backend cycles idle (75.04%) - 35,636,026,969 instructions:u # 3.02 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 3.496623758 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.676449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.774483e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.774483e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.023481 sec + 12,489,396,939 cycles # 3.100 GHz + 35,729,005,545 instructions # 2.86 insn per cycle + 4.030070872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.763409e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.020763e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.020763e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.377364 sec - 8,062,357,173 cycles:u # 3.349 GHz (74.79%) - 10,706,692 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.90%) - 1,739,443,425 stalled-cycles-backend:u # 21.57% backend cycles idle (75.06%) - 21,151,073,294 instructions:u # 2.62 insn per cycle - # 0.08 stalled cycles per insn (75.08%) - 2.411386800 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.223056e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.492367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.492367e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.587451 sec + 8,029,022,316 cycles # 3.097 GHz + 21,259,346,955 instructions # 2.65 insn per cycle + 2.593594031 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.798115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.412179e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.412179e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.521896 sec - 5,031,958,555 cycles:u # 3.241 GHz (74.85%) - 9,260,998 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.79%) - 293,419,297 stalled-cycles-backend:u # 5.83% backend cycles idle (74.75%) - 11,412,283,229 instructions:u # 2.27 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 1.556234307 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.161232e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.723558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.723558e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.806475 sec + 5,298,285,942 cycles # 2.924 GHz + 11,405,384,263 instructions # 2.15 insn per cycle + 1.812976231 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.596114e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.252611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.252611e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.694412 sec + 4,989,073,893 cycles # 2.936 GHz + 10,598,434,802 instructions # 2.12 insn per cycle + 1.700669389 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.739884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.066921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.066921e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.317514 sec + 4,718,237,627 cycles # 2.031 GHz + 8,566,667,295 instructions # 1.82 insn per cycle + 2.324277048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 6e4e5c02af..8caa99d7b5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:20:11 +DATE: 2024-02-05_21:07:18 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.717084e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.952544e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.117655e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.353871 sec - 770,971,152 cycles:u # 2.054 GHz (73.64%) - 2,234,320 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.19%) - 4,680,600 stalled-cycles-backend:u # 0.61% backend cycles idle (75.49%) - 1,369,089,853 instructions:u # 1.78 insn per cycle - # 0.00 stalled cycles per insn (75.15%) - 0.403399827 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.268619e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.598652e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.970928e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.479559 sec + 2,115,745,361 cycles # 3.004 GHz + 3,026,067,491 instructions # 1.43 insn per cycle + 0.780309314 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.694112e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.777074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.777074e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.050256 sec - 12,639,314,034 cycles:u # 3.100 GHz (74.91%) - 7,170,658 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) - 6,335,718 stalled-cycles-backend:u # 0.05% backend cycles idle (75.01%) - 37,070,689,405 instructions:u # 2.93 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 4.081116452 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.373505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.452479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.452479e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.502538 sec + 13,910,563,154 cycles # 3.086 GHz + 37,077,613,443 instructions # 2.67 insn per cycle + 4.514374712 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.510638e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.885491e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.885491e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 2.060524 sec - 6,352,569,577 cycles:u # 3.042 GHz (75.10%) - 7,069,743 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.10%) - 2,199,024,394 stalled-cycles-backend:u # 34.62% backend cycles idle (75.11%) - 15,190,984,241 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.11%) - 2.092123124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.415916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.883206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.883206e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.020931 sec + 6,164,585,136 cycles # 3.041 GHz + 15,211,692,086 instructions # 2.47 insn per cycle + 2.038780280 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.110080e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.257453e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.257453e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.099872 sec - 3,366,039,321 cycles:u # 2.985 GHz (74.70%) - 7,543,952 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.05%) - 922,135,812 stalled-cycles-backend:u # 27.40% backend cycles idle (75.18%) - 7,686,666,680 instructions:u # 2.28 insn per cycle - # 0.12 stalled cycles per insn (75.18%) - 1.131211150 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.178438e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.056543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056543e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.231884 sec + 3,441,665,218 cycles # 2.780 GHz + 7,715,516,449 instructions # 2.24 insn per cycle + 1.250456436 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.053344e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.232505e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.232505e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.080384 sec + 3,174,573,431 cycles # 2.923 GHz + 7,109,345,653 instructions # 2.24 insn per cycle + 1.096556108 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.797220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.705551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.705551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.430051 sec + 2,982,263,461 cycles # 2.077 GHz + 5,763,882,815 instructions # 1.93 insn per cycle + 1.451099483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 98175f490e..efebdc8ccf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:09:32 +DATE: 2024-02-05_21:56:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.469110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.066051e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.066051e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.161093 sec - 3,534,619,605 cycles:u # 2.971 GHz (75.18%) - 21,007,733 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.14%) - 1,154,175,574 stalled-cycles-backend:u # 32.65% backend cycles idle (75.07%) - 3,864,038,545 instructions:u # 1.09 insn per cycle - # 0.30 stalled cycles per insn (74.88%) - 1.219336608 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.147887e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.526108e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.526108e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.661489 sec + 2,693,816,911 cycles # 3.036 GHz + 4,104,498,443 instructions # 1.52 insn per cycle + 0.946519385 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.989598e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.079878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.079878e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.693726 sec - 12,717,760,292 cycles:u # 3.411 GHz (74.92%) - 7,367,503 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) - 19,690,684 stalled-cycles-backend:u # 0.15% backend cycles idle (74.90%) - 37,107,308,562 instructions:u # 2.92 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 3.730652000 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.324858e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.402467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.402467e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.637720 sec + 14,089,911,598 cycles # 3.040 GHz + 37,125,365,010 instructions # 2.63 insn per cycle + 4.644678781 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.079130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.473563e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.473563e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.917660 sec - 6,440,768,919 cycles:u # 3.300 GHz (75.00%) - 7,702,194 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.00%) - 2,203,771,040 stalled-cycles-backend:u # 34.22% backend cycles idle (75.03%) - 15,518,583,539 instructions:u # 2.41 insn per cycle - # 0.14 stalled cycles per insn (75.03%) - 1.955037082 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.462474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.934411e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.934411e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.047964 sec + 6,366,264,162 cycles # 3.100 GHz + 15,491,842,039 instructions # 2.43 insn per cycle + 2.055155435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.210191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.364051e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.364051e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.056488 sec - 3,430,795,268 cycles:u # 3.146 GHz (74.97%) - 7,447,191 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.06%) - 937,957,545 stalled-cycles-backend:u # 27.34% backend cycles idle (75.06%) - 7,890,493,966 instructions:u # 2.30 insn per cycle - # 0.12 stalled cycles per insn (75.12%) - 1.093907892 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.528714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.093980e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.229951 sec + 3,645,466,406 cycles # 2.949 GHz + 7,952,982,935 instructions # 2.18 insn per cycle + 1.237058942 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.028940e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.203411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203411e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.149866 sec + 3,370,367,591 cycles # 2.919 GHz + 7,347,327,720 instructions # 2.18 insn per cycle + 1.156955542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.733701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626137e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.486462 sec + 3,189,866,182 cycles # 2.137 GHz + 6,021,180,514 instructions # 1.89 insn per cycle + 1.493653390 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index c402bf0e6a..7acf133e3f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:20:59 +DATE: 2024-02-05_22:09:40 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.128644e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.957374e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.121793e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.014163 sec - 3,102,163,524 cycles:u # 2.988 GHz (74.63%) - 10,658,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.97%) - 1,156,816,869 stalled-cycles-backend:u # 37.29% backend cycles idle (75.35%) - 2,768,693,006 instructions:u # 0.89 insn per cycle - # 0.42 stalled cycles per insn (75.39%) - 1.061213020 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.412916e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.619089e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.939568e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.556731 sec + 2,380,708,912 cycles # 3.021 GHz + 3,476,777,878 instructions # 1.46 insn per cycle + 0.845720258 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.995911e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.086114e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.086114e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.642503 sec - 12,638,106,769 cycles:u # 3.445 GHz (74.93%) - 7,311,831 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) - 14,963,186 stalled-cycles-backend:u # 0.12% backend cycles idle (74.94%) - 37,156,096,327 instructions:u # 2.94 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 3.670771442 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.373825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.451993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.451993e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.553769 sec + 14,061,285,831 cycles # 3.085 GHz + 37,107,009,606 instructions # 2.64 insn per cycle + 4.559663349 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.314296e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.749749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.749749e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.802842 sec - 6,199,351,076 cycles:u # 3.389 GHz (74.81%) - 6,748,605 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.03%) - 2,101,796,505 stalled-cycles-backend:u # 33.90% backend cycles idle (75.07%) - 15,211,927,423 instructions:u # 2.45 insn per cycle - # 0.14 stalled cycles per insn (75.08%) - 1.830971384 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.488905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.966686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.966686e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 +TOTAL : 2.047136 sec + 6,328,275,118 cycles # 3.084 GHz + 15,224,386,903 instructions # 2.41 insn per cycle + 2.052913694 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221195e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.377891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.377891e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.001392 sec - 3,363,118,873 cycles:u # 3.273 GHz (74.97%) - 7,454,848 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.09%) - 918,947,185 stalled-cycles-backend:u # 27.32% backend cycles idle (75.09%) - 7,657,243,210 instructions:u # 2.28 insn per cycle - # 0.12 stalled cycles per insn (75.10%) - 1.029676638 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.247802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.063995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063995e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.274630 sec + 3,616,593,672 cycles # 2.827 GHz + 7,699,869,910 instructions # 2.13 insn per cycle + 1.280330551 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.052801e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.234165e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.234165e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.134805 sec + 3,350,043,075 cycles # 2.939 GHz + 7,059,303,612 instructions # 2.11 insn per cycle + 1.140687023 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.769621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.675649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.675649e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.489497 sec + 3,171,881,548 cycles # 2.123 GHz + 5,713,001,657 instructions # 1.80 insn per cycle + 1.495609576 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 7e24e0cb32..a5259989f5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_19:17:16 +DATE: 2024-02-05_22:03:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.203851e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.926479e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.088864e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.148947 sec - 3,515,379,084 cycles:u # 3.036 GHz (75.11%) - 22,072,547 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.20%) - 1,149,905,913 stalled-cycles-backend:u # 32.71% backend cycles idle (75.19%) - 3,771,155,190 instructions:u # 1.07 insn per cycle - # 0.30 stalled cycles per insn (75.19%) - 1.197866574 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.958404e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.636826e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.959534e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.609516 sec + 2,471,096,266 cycles # 2.985 GHz + 3,829,067,264 instructions # 1.55 insn per cycle + 0.888079293 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.990218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080220e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080220e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.650953 sec - 12,641,654,314 cycles:u # 3.436 GHz (74.99%) - 7,078,341 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) - 11,932,077 stalled-cycles-backend:u # 0.09% backend cycles idle (74.99%) - 37,053,528,887 instructions:u # 2.93 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 3.680952514 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.336964e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.415294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415294e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.572372 sec + 13,889,600,632 cycles # 3.035 GHz + 37,077,606,876 instructions # 2.67 insn per cycle + 4.578241412 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.112952e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.515228e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.515228e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.860685 sec - 6,366,354,024 cycles:u # 3.371 GHz (75.01%) - 6,927,995 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.01%) - 2,205,196,706 stalled-cycles-backend:u # 34.64% backend cycles idle (75.01%) - 15,202,556,643 instructions:u # 2.39 insn per cycle - # 0.15 stalled cycles per insn (75.01%) - 1.890454940 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.532200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.012572e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.012572e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.978043 sec + 6,163,217,538 cycles # 3.108 GHz + 15,211,693,777 instructions # 2.47 insn per cycle + 1.984000649 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.224299e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.381818e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.381818e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.000758 sec - 3,385,540,430 cycles:u # 3.291 GHz (74.59%) - 7,948,412 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.98%) - 921,498,919 stalled-cycles-backend:u # 27.22% backend cycles idle (75.12%) - 7,650,419,045 instructions:u # 2.26 insn per cycle - # 0.12 stalled cycles per insn (75.12%) - 1.030474482 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.669054e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112341e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112341e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.167764 sec + 3,437,424,675 cycles # 2.931 GHz + 7,714,739,146 instructions # 2.24 insn per cycle + 1.173813048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.060939e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.242481e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242481e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.072289 sec + 3,171,680,355 cycles # 2.945 GHz + 7,108,503,211 instructions # 2.24 insn per cycle + 1.078280755 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.899146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.828249e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.828249e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.411176 sec + 2,980,683,396 cycles # 2.105 GHz + 5,762,290,086 instructions # 1.93 insn per cycle + 1.417037826 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 55e5f17e9c..5fa43347d8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:20:33 +DATE: 2024-02-05_21:07:42 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.869687e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.117495e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.301836e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.329222 sec - 758,075,333 cycles:u # 2.173 GHz (73.98%) - 2,198,225 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.40%) - 5,131,410 stalled-cycles-backend:u # 0.68% backend cycles idle (75.50%) - 1,351,085,970 instructions:u # 1.78 insn per cycle - # 0.00 stalled cycles per insn (75.69%) - 0.378253825 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.331904e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.630922e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023726e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478203 sec + 2,110,495,883 cycles # 2.993 GHz + 3,011,337,942 instructions # 1.43 insn per cycle + 0.782662262 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.683405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.765766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.765766e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.064858 sec - 12,705,974,677 cycles:u # 3.105 GHz (74.98%) - 7,225,703 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 9,464,671 stalled-cycles-backend:u # 0.07% backend cycles idle (74.98%) - 37,493,096,667 instructions:u # 2.95 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 4.094241095 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.399483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.481184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.481184e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.454439 sec + 13,804,474,973 cycles # 3.096 GHz + 37,479,357,412 instructions # 2.72 insn per cycle + 4.468292450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.637462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.178307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.178307e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.736673 sec - 5,359,824,438 cycles:u # 3.038 GHz (74.86%) - 7,182,035 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.05%) - 1,296,947,856 stalled-cycles-backend:u # 24.20% backend cycles idle (75.07%) - 15,243,397,165 instructions:u # 2.84 insn per cycle - # 0.09 stalled cycles per insn (75.07%) - 1.768220444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.226031e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.848097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.848097e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.767871 sec + 5,476,477,924 cycles # 3.088 GHz + 15,244,208,658 instructions # 2.78 insn per cycle + 1.792416092 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.917228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.592228e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.592228e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.474582 sec - 4,473,730,027 cycles:u # 2.977 GHz (75.00%) - 7,117,732 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%) - 1,664,089,796 stalled-cycles-backend:u # 37.20% backend cycles idle (74.98%) - 9,819,840,116 instructions:u # 2.20 insn per cycle - # 0.17 stalled cycles per insn (74.98%) - 1.506565113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.862323e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.568663e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.568663e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.611743 sec + 4,709,783,450 cycles # 2.912 GHz + 9,849,850,399 instructions # 2.09 insn per cycle + 1.628871192 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186428369954 -Relative difference = 1.7604478492421832e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.897868e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.669459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.669459e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.607965 sec + 4,489,380,710 cycles # 2.782 GHz + 9,202,142,806 instructions # 2.05 insn per cycle + 1.628198262 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.506579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.128587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.128587e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.696440 sec + 3,457,553,359 cycles # 2.031 GHz + 6,874,633,785 instructions # 1.99 insn per cycle + 1.719781829 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183217635378 +Relative difference = 1.5859655131013432e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index d8fd9d737f..0f0c0cba4d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:51:20 +DATE: 2024-02-05_21:46:32 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.006988e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.955973e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.119970e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.319267 sec - 752,328,917 cycles:u # 2.225 GHz (75.00%) - 2,255,449 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.19%) - 4,913,341 stalled-cycles-backend:u # 0.65% backend cycles idle (75.23%) - 1,335,747,468 instructions:u # 1.78 insn per cycle - # 0.00 stalled cycles per insn (77.06%) - 0.366668404 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.468755e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.656223e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.977409e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.476050 sec + 2,139,983,474 cycles # 3.012 GHz + 3,041,923,459 instructions # 1.42 insn per cycle + 0.769270550 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.194534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.299073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.299073e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.425079 sec - 11,759,964,306 cycles:u # 3.407 GHz (74.97%) - 7,329,194 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 1,664,701,710 stalled-cycles-backend:u # 14.16% backend cycles idle (74.97%) - 34,201,803,267 instructions:u # 2.91 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 3.454461902 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.688483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.789170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 3.984623 sec + 12,412,838,771 cycles # 3.111 GHz + 34,217,255,304 instructions # 2.76 insn per cycle + 3.991638757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.187168e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.752438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.752438e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.600326 sec - 5,421,260,654 cycles:u # 3.332 GHz (74.93%) - 7,370,627 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.93%) - 2,014,655,554 stalled-cycles-backend:u # 37.16% backend cycles idle (74.72%) - 14,670,168,796 instructions:u # 2.71 insn per cycle - # 0.14 stalled cycles per insn (74.77%) - 1.630912842 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.428453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090008e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.090008e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.714349 sec + 5,359,595,376 cycles # 3.117 GHz + 14,586,771,788 instructions # 2.72 insn per cycle + 1.720584618 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198769558221 -Relative difference = 6.06481491495597e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192580919713 +Relative difference = 1.2721291123071246e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.366961e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.026906e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.026906e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.268598 sec - 4,255,477,612 cycles:u # 3.285 GHz (74.73%) - 8,186,065 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.73%) - 1,645,922,864 stalled-cycles-backend:u # 38.68% backend cycles idle (74.90%) - 9,083,647,172 instructions:u # 2.13 insn per cycle - # 0.18 stalled cycles per insn (75.21%) - 1.299091238 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.084909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.076524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.076524e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.381325 sec + 4,058,929,495 cycles # 2.928 GHz + 9,088,076,266 instructions # 2.24 insn per cycle + 1.387605112 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186752004549 -Relative difference = 1.6009291367898262e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.757180e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.937518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.937518e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.279969 sec + 3,797,061,121 cycles # 2.955 GHz + 8,440,365,534 instructions # 2.22 insn per cycle + 1.285904868 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.095576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.636466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.636466e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.805353 sec + 3,727,936,926 cycles # 2.059 GHz + 7,571,161,321 instructions # 2.03 insn per cycle + 1.811498455 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183350348845 +Relative difference = 1.6513796936156652e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 112654eea6..494b018564 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:51:40 +DATE: 2024-02-05_21:46:55 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.551366e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.113232e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.297209e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.320151 sec - 759,031,987 cycles:u # 2.225 GHz (74.40%) - 2,153,774 stalled-cycles-frontend:u # 0.28% frontend cycles idle (76.10%) - 4,939,796 stalled-cycles-backend:u # 0.65% backend cycles idle (76.08%) - 1,297,465,264 instructions:u # 1.71 insn per cycle - # 0.00 stalled cycles per insn (74.19%) - 0.370145172 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.467486e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.714851e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049410e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.473680 sec + 2,149,151,229 cycles # 3.027 GHz + 3,029,021,746 instructions # 1.41 insn per cycle + 0.767337928 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.410323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.529706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.529706e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.217345 sec - 10,905,311,823 cycles:u # 3.362 GHz (74.96%) - 6,729,741 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) - 247,672,337 stalled-cycles-backend:u # 2.27% backend cycles idle (75.10%) - 35,420,582,291 instructions:u # 3.25 insn per cycle - # 0.01 stalled cycles per insn (75.10%) - 3.247331196 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.769244e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.876189e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.876189e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 3.872564 sec + 11,947,072,264 cycles # 3.082 GHz + 35,407,153,685 instructions # 2.96 insn per cycle + 3.878838879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.673833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.335478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.335478e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.509011 sec - 5,063,533,669 cycles:u # 3.297 GHz (75.00%) - 7,796,156 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.00%) - 1,328,081,677 stalled-cycles-backend:u # 26.23% backend cycles idle (75.00%) - 14,007,267,862 instructions:u # 2.77 insn per cycle - # 0.09 stalled cycles per insn (75.00%) - 1.539528991 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.615505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.365022e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.365022e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.673174 sec + 5,080,150,146 cycles # 3.032 GHz + 14,046,773,867 instructions # 2.77 insn per cycle + 1.679714771 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198892958462 -Relative difference = 5.4565783974899003e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192554144189 +Relative difference = 1.2589315209891237e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.945097e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.099043e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.099043e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.195679 sec - 3,997,440,574 cycles:u # 3.270 GHz (74.65%) - 7,909,830 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.87%) - 1,422,447,437 stalled-cycles-backend:u # 35.58% backend cycles idle (75.13%) - 8,582,353,514 instructions:u # 2.15 insn per cycle - # 0.17 stalled cycles per insn (75.13%) - 1.225635820 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.217955e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.248962e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.248962e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.360539 sec + 3,995,329,762 cycles # 2.926 GHz + 8,629,021,722 instructions # 2.16 insn per cycle + 1.366675172 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186836987734 -Relative difference = 1.559041129563128e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.478461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.623535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.623535e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.323023 sec + 3,695,157,256 cycles # 2.782 GHz + 8,100,478,864 instructions # 2.19 insn per cycle + 1.329144793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.395898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.992762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.992762e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.723948 sec + 3,575,759,856 cycles # 2.068 GHz + 7,372,918,595 instructions # 2.06 insn per cycle + 1.730165324 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183569209650 +Relative difference = 1.7592557106041962e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 7755f71ce4..c9935b5b0c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:20:55 +DATE: 2024-02-05_21:08:06 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.904056e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.014821e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070633e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.388289 sec - 864,348,193 cycles:u # 2.121 GHz (73.88%) - 2,130,510 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.67%) - 5,219,675 stalled-cycles-backend:u # 0.60% backend cycles idle (76.29%) - 1,361,129,233 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (76.46%) - 0.442760013 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.019007e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132370e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269812e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.529781 sec + 2,298,734,230 cycles # 2.997 GHz + 3,298,095,711 instructions # 1.43 insn per cycle + 0.850734215 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.216739e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.273764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.273764e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.926618 sec - 15,301,124,701 cycles:u # 3.086 GHz (75.00%) - 9,211,660 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) - 209,883,841 stalled-cycles-backend:u # 1.37% backend cycles idle (74.99%) - 39,304,216,708 instructions:u # 2.57 insn per cycle - # 0.01 stalled cycles per insn (75.01%) - 4.960995675 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.178285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.242535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.242535e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.919545 sec + 15,213,150,802 cycles # 3.089 GHz + 39,292,730,130 instructions # 2.58 insn per cycle + 4.935455317 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.145121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.361455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.361455e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.726938 sec - 8,377,842,264 cycles:u # 3.037 GHz (74.95%) - 9,175,705 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.07%) - 869,503,656 stalled-cycles-backend:u # 10.38% backend cycles idle (75.07%) - 24,090,997,954 instructions:u # 2.88 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 2.762975684 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.613457e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806382e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.007093 sec + 8,851,938,276 cycles # 2.938 GHz + 24,094,532,863 instructions # 2.72 insn per cycle + 3.022798970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.969064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.490580e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.490580e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.687044 sec - 5,054,763,540 cycles:u # 2.943 GHz (74.85%) - 9,457,236 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.89%) - 470,935,529 stalled-cycles-backend:u # 9.32% backend cycles idle (74.89%) - 11,453,581,126 instructions:u # 2.27 insn per cycle - # 0.04 stalled cycles per insn (74.86%) - 1.721716287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.951513e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.479974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.479974e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.867658 sec + 5,498,255,305 cycles # 2.934 GHz + 11,448,732,219 instructions # 2.08 insn per cycle + 1.885253906 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.941897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.671977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.671977e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.615140 sec + 4,777,526,896 cycles # 2.947 GHz + 10,317,033,200 instructions # 2.16 insn per cycle + 1.641239162 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.596452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.908012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.908012e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.388979 sec + 4,869,533,602 cycles # 2.034 GHz + 7,366,156,467 instructions # 1.51 insn per cycle + 2.406026106 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index aca89217a0..f6af1f82c5 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-08_18:21:19 +DATE: 2024-02-05_21:08:33 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.854985e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926967e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.980718e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.390795 sec - 884,636,377 cycles:u # 2.137 GHz (74.00%) - 2,082,258 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.82%) - 5,724,825 stalled-cycles-backend:u # 0.65% backend cycles idle (74.93%) - 1,447,624,356 instructions:u # 1.64 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 0.443869891 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.047759e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135912e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275542e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.530154 sec + 2,242,423,246 cycles # 2.921 GHz + 3,250,635,158 instructions # 1.45 insn per cycle + 0.850969646 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.186066e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.238603e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.238603e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.987560 sec - 15,495,821,711 cycles:u # 3.088 GHz (74.98%) - 9,990,162 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 10,915,295 stalled-cycles-backend:u # 0.07% backend cycles idle (74.99%) - 40,144,709,771 instructions:u # 2.59 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 5.021690865 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.204740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.270958e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.270958e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.862249 sec + 15,082,353,353 cycles # 3.099 GHz + 40,115,404,660 instructions # 2.66 insn per cycle + 4.874821925 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.063270e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.270255e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.270255e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.775738 sec - 8,514,738,248 cycles:u # 3.033 GHz (74.93%) - 11,036,987 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.95%) - 676,892,006 stalled-cycles-backend:u # 7.95% backend cycles idle (74.94%) - 23,573,068,748 instructions:u # 2.77 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 2.811372386 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.851334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.075206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.075206e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.828378 sec + 8,690,865,791 cycles # 3.066 GHz + 23,534,050,563 instructions # 2.71 insn per cycle + 2.847900143 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.224432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.662940e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.662940e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.882137 sec - 5,711,671,390 cycles:u # 2.984 GHz (74.93%) - 9,151,587 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.96%) - 710,469,523 stalled-cycles-backend:u # 12.44% backend cycles idle (74.96%) - 13,147,735,569 instructions:u # 2.30 insn per cycle - # 0.05 stalled cycles per insn (74.93%) - 1.917994686 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.250913e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.667762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.667762e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.101952 sec + 6,182,697,935 cycles # 2.933 GHz + 13,102,957,598 instructions # 2.12 insn per cycle + 2.120994795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.627757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.100688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.100688e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.968170 sec + 5,762,105,462 cycles # 2.919 GHz + 12,210,003,651 instructions # 2.12 insn per cycle + 1.989376159 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.203452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.459686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.459686e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.600019 sec + 5,268,712,876 cycles # 2.022 GHz + 8,448,368,712 instructions # 1.60 insn per cycle + 2.618948581 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 2d74f3e2ca..26d1384e8b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:21:43 +DATE: 2024-02-05_21:09:01 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.896793e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042103e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.049979e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.438306 sec - 1,018,654,022 cycles:u # 2.283 GHz (75.30%) - 2,342,193 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.77%) - 5,357,864 stalled-cycles-backend:u # 0.53% backend cycles idle (74.68%) - 1,506,452,711 instructions:u # 1.48 insn per cycle - # 0.00 stalled cycles per insn (74.61%) - 0.486442473 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.517520e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050894e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067550e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463659 sec + 2,055,183,874 cycles # 3.003 GHz + 2,934,312,859 instructions # 1.43 insn per cycle + 0.756953193 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.626166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.842773e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.848585e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.644740 sec - 1,614,380,153 cycles:u # 2.419 GHz (74.94%) - 2,305,119 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.90%) - 5,060,661 stalled-cycles-backend:u # 0.31% backend cycles idle (75.06%) - 1,903,252,561 instructions:u # 1.18 insn per cycle - # 0.00 stalled cycles per insn (75.71%) - 0.693964886 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.043868e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.322139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339141e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.609822 sec + 2,563,240,653 cycles # 3.017 GHz + 3,868,983,956 instructions # 1.51 insn per cycle + 0.911526050 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.664905e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.676215e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.676215e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 6.174747 sec - 19,594,832,239 cycles:u # 3.161 GHz (74.97%) - 3,179,318 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 3,393,997,547 stalled-cycles-backend:u # 17.32% backend cycles idle (74.96%) - 57,922,192,784 instructions:u # 2.96 insn per cycle - # 0.06 stalled cycles per insn (74.92%) - 6.201296621 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.614247e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.626993e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.626993e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.289889 sec + 19,495,538,491 cycles # 3.098 GHz + 57,922,287,734 instructions # 2.97 insn per cycle + 6.296761383 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.445350e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.492063e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.492063e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 3.035062 sec - 9,609,956,734 cycles:u # 3.142 GHz (74.92%) - 2,571,796 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) - 2,372,598,985 stalled-cycles-backend:u # 24.69% backend cycles idle (74.90%) - 29,983,159,273 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (74.94%) - 3.061682288 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.011387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.057745e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.057745e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.292197 sec + 10,190,979,957 cycles # 3.092 GHz + 29,943,491,460 instructions # 2.94 insn per cycle + 3.306811606 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.097086e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.115819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.115819e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.519592 sec - 4,808,370,439 cycles:u # 3.115 GHz (74.74%) - 2,569,559 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%) - 1,462,517,151 stalled-cycles-backend:u # 30.42% backend cycles idle (75.13%) - 11,197,235,575 instructions:u # 2.33 insn per cycle - # 0.13 stalled cycles per insn (75.13%) - 1.547014856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.433832e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.604416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.604416e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.759958 sec + 4,910,813,970 cycles # 2.783 GHz + 11,211,044,010 instructions # 2.28 insn per cycle + 1.771404819 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.126386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.149735e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149735e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.477093 sec + 4,299,259,698 cycles # 2.902 GHz + 10,187,392,743 instructions # 2.37 insn per cycle + 1.492930763 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.244237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.368358e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.368358e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.011538 sec + 3,897,655,272 cycles # 1.933 GHz + 5,708,540,517 instructions # 1.46 insn per cycle + 2.026194749 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 0163eb03d1..e40acb18da 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_19:09:53 +DATE: 2024-02-05_21:56:52 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.493228e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015882e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.015882e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.564258 sec - 1,652,077,487 cycles:u # 2.814 GHz (74.62%) - 10,611,849 stalled-cycles-frontend:u # 0.64% frontend cycles idle (75.41%) - 258,215,190 stalled-cycles-backend:u # 15.63% backend cycles idle (75.45%) - 2,020,090,968 instructions:u # 1.22 insn per cycle - # 0.13 stalled cycles per insn (75.09%) - 0.610698195 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.636814e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.747936e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.747936e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.488125 sec + 2,115,646,575 cycles # 3.006 GHz + 3,165,191,660 instructions # 1.50 insn per cycle + 0.761567703 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.195589e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.670385e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.670385e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.268176 sec - 3,822,769,864 cycles:u # 2.920 GHz (75.21%) - 30,801,656 stalled-cycles-frontend:u # 0.81% frontend cycles idle (74.97%) - 859,824,473 stalled-cycles-backend:u # 22.49% backend cycles idle (75.04%) - 3,882,660,072 instructions:u # 1.02 insn per cycle - # 0.22 stalled cycles per insn (74.99%) - 1.331437909 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.698370e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.508650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.508650e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.819811 sec + 3,231,643,804 cycles # 3.017 GHz + 5,170,954,794 instructions # 1.60 insn per cycle + 1.129350867 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.949222e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.961653e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.961653e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.583285 sec - 19,616,988,865 cycles:u # 3.499 GHz (74.97%) - 3,125,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.03%) - 3,422,426,818 stalled-cycles-backend:u # 17.45% backend cycles idle (75.03%) - 57,899,685,241 instructions:u # 2.95 insn per cycle - # 0.06 stalled cycles per insn (75.03%) - 5.608407195 seconds time elapsed +OMP threads / `nproc --all` = 1 / 
4 +EvtsPerSec[Rmb+ME] (23) = ( 2.586265e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.599432e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.599432e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.365086 sec + 19,535,294,271 cycles # 3.067 GHz + 57,927,366,883 instructions # 2.97 insn per cycle + 6.370310036 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.053353e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.104745e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.104745e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.734583 sec - 9,609,540,434 cycles:u # 3.485 GHz (75.05%) - 1,957,694 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.05%) - 2,381,141,376 stalled-cycles-backend:u # 24.78% backend cycles idle (74.91%) - 29,996,044,463 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (74.91%) - 2.761024409 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.890841e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.937483e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.937483e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.379934 sec + 10,238,179,748 cycles # 3.026 GHz + 29,994,338,386 instructions # 2.93 insn per cycle + 3.385379622 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.239724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.261065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.261065e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.349709 sec - 4,754,036,505 cycles:u # 3.464 GHz (74.94%) - 2,353,246 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.94%) - 1,461,374,334 stalled-cycles-backend:u # 30.74% backend cycles idle (74.94%) - 11,263,062,636 instructions:u # 2.37 insn per cycle - # 0.13 stalled cycles per insn (74.94%) - 1.375449114 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.771282e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.950160e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.950160e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.706937 sec + 4,944,909,058 cycles # 2.891 GHz + 11,258,821,376 instructions # 2.28 insn per cycle + 1.712018011 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.116302e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139544e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.497508 sec + 4,331,250,443 cycles # 2.884 GHz + 10,237,737,318 instructions # 2.36 insn per cycle + 1.502820787 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.218152e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.344259e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.344259e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.025386 sec + 3,933,406,562 cycles # 1.938 GHz + 5,747,129,673 instructions # 1.46 insn per cycle + 2.030424043 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 884bcfad54..e6e9b880f5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:22:10 +DATE: 2024-02-05_21:09:30 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.906147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.039261e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.046147e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.439845 sec - 1,023,928,489 cycles:u # 2.320 GHz (74.84%) - 2,294,233 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.09%) - 5,690,077 stalled-cycles-backend:u # 0.56% backend cycles idle (74.23%) - 1,552,350,558 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (72.68%) - 0.484797136 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.433452e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.035720e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052077e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.461879 sec + 2,049,463,501 cycles # 3.004 GHz + 2,931,930,967 instructions # 1.43 insn per cycle + 0.758516650 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.592965e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.792693e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797749e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.644636 sec - 1,588,228,432 cycles:u # 2.368 GHz (75.02%) - 2,221,853 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.02%) - 5,385,824 stalled-cycles-backend:u # 0.34% backend cycles idle (75.03%) - 1,972,260,150 instructions:u # 1.24 insn per cycle - # 0.00 stalled cycles per insn (74.42%) - 0.694016128 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.036270e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.309299e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325891e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.604488 sec + 2,554,084,852 cycles # 3.010 GHz + 3,806,222,274 instructions # 1.49 insn per cycle + 0.908408851 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.634125e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.644901e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.644901e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 6.245570 sec - 20,056,849,486 cycles:u # 3.200 GHz (74.99%) - 2,954,288 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 3,112,319,221 stalled-cycles-backend:u # 15.52% backend cycles idle (74.99%) - 57,703,493,427 instructions:u # 2.88 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 6.271210651 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.603637e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.616413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.616413e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.315367 sec + 19,440,595,699 cycles # 3.076 GHz + 57,746,273,687 instructions # 2.97 insn per cycle + 6.322559095 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.268056e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.312584e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.312584e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 3.136288 sec - 9,733,143,847 cycles:u # 3.080 GHz (74.94%) - 2,615,873 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.94%) - 2,287,852,817 stalled-cycles-backend:u # 23.51% backend cycles idle (74.94%) - 30,344,746,875 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (74.96%) - 3.163945104 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.011637e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.057908e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.057908e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.291611 sec + 10,256,692,556 cycles # 3.113 GHz + 30,334,472,962 instructions # 2.96 insn per cycle + 3.303684492 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.054056e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071587e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.580938 sec - 4,894,522,264 cycles:u # 3.049 GHz (75.08%) - 1,978,954 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.08%) - 1,691,604,797 stalled-cycles-backend:u # 34.56% backend cycles idle (75.09%) - 11,675,651,815 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.09%) - 1.608574799 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.549169e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.717858e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.717858e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.738062 sec + 5,053,352,211 cycles # 2.901 GHz + 11,664,563,199 instructions # 2.31 insn per cycle + 1.751875656 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.049432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.069832e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069832e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.583348 sec + 4,610,661,316 cycles # 2.904 GHz + 10,805,809,627 instructions # 2.34 insn per cycle + 1.595035145 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.147443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.270030e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.270030e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.035235 sec + 3,947,076,552 cycles # 1.936 GHz + 5,998,684,941 instructions # 1.52 insn per cycle + 2.049078012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 3375e46eaf..220563e4c6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:22:37 +DATE: 2024-02-05_21:09:59 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.934586e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.876139e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.928934e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.383720 sec - 822,049,704 cycles:u # 2.134 GHz (74.53%) - 2,208,723 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.04%) - 4,843,042 stalled-cycles-backend:u # 0.59% backend cycles idle (75.16%) - 1,393,763,752 instructions:u # 1.70 insn per cycle - # 0.00 stalled cycles per insn (74.22%) - 0.430994484 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.389953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.379235e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499735e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.445751 sec + 1,974,295,648 cycles # 2.963 GHz + 2,744,935,553 instructions # 1.39 insn per cycle + 0.739534699 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.370264e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.631551e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.636459e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.472621 sec - 1,133,739,668 cycles:u # 2.300 GHz (72.63%) - 2,155,754 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.19%) - 4,276,204 stalled-cycles-backend:u # 0.38% backend cycles idle (75.55%) - 1,545,443,044 instructions:u # 1.36 insn per cycle - # 0.00 stalled cycles per insn (75.54%) - 0.520942641 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.048502e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.406399e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.505283e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.496598 sec + 2,171,209,286 cycles # 2.997 GHz + 3,099,694,880 instructions # 1.43 insn per cycle + 0.782136858 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.864581e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.878140e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.878140e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.743938 sec - 17,773,852,419 cycles:u # 3.082 GHz (74.91%) - 2,894,955 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) - 3,669,486,104 stalled-cycles-backend:u # 20.65% backend cycles idle (75.03%) - 55,281,356,088 instructions:u # 3.11 insn per cycle - # 0.07 stalled cycles per insn (75.04%) - 5.770116566 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.797017e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.812360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.812360e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 5.878452 sec + 18,166,915,369 cycles # 3.088 GHz + 55,237,437,169 instructions # 3.04 insn per cycle + 5.886190167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.491676e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.641990e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.641990e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.750083 sec - 5,438,030,974 cycles:u # 3.066 GHz (74.75%) - 2,225,662 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.78%) - 1,684,378,735 stalled-cycles-backend:u # 30.97% backend cycles idle (74.99%) - 16,144,361,424 instructions:u # 2.97 insn per cycle - # 0.10 stalled cycles per insn (75.19%) - 1.777268015 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.039866e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.198983e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.198983e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.832280 sec + 5,681,092,016 cycles # 3.093 GHz + 16,128,089,686 instructions # 2.84 insn per cycle + 1.846415672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.083535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.154597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.154597e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.810354 sec - 2,533,172,837 cycles:u # 3.037 GHz (75.07%) - 1,780,574 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.07%) - 832,632,139 stalled-cycles-backend:u # 32.87% backend cycles idle (75.07%) - 6,082,444,674 instructions:u # 2.40 insn per cycle - # 0.14 stalled cycles per insn (75.08%) - 0.838024436 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.890873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.959141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959141e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.887375 sec + 2,587,042,735 cycles # 2.901 GHz + 6,085,514,363 instructions # 2.35 insn per cycle + 0.899074779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.130938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.235290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.235290e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.789645 sec + 2,308,576,199 cycles # 2.908 GHz + 5,552,964,613 instructions # 2.41 insn per cycle + 0.803204840 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.637248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.690128e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.690128e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.023178 sec + 2,012,076,992 cycles # 1.958 GHz + 3,285,913,321 instructions # 1.63 insn per cycle + 1.035051327 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index c81ca50562..f99fe56362 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_19:10:18 +DATE: 2024-02-05_21:57:21 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.273252e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.588256e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.588256e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 -TOTAL : 0.499256 sec - 1,455,482,979 cycles:u # 2.784 GHz (73.33%) - 10,630,229 stalled-cycles-frontend:u # 0.73% frontend cycles idle (73.99%) - 276,661,876 stalled-cycles-backend:u # 19.01% backend cycles idle (76.12%) - 1,855,040,061 instructions:u # 1.27 insn per cycle - # 0.15 stalled cycles per insn (76.37%) - 0.546123141 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.982111e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.119833e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.119833e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.457183 sec + 1,967,411,727 cycles # 2.939 GHz + 2,909,797,751 instructions # 1.48 insn per cycle + 0.727045964 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.140904e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.468511e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.468511e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 -TOTAL : 1.068236 sec - 3,219,691,880 cycles:u # 2.925 GHz (75.29%) - 29,579,411 stalled-cycles-frontend:u # 0.92% frontend cycles idle (75.36%) - 859,498,606 stalled-cycles-backend:u # 26.70% backend cycles idle (75.33%) - 3,423,691,451 instructions:u # 1.06 insn per cycle - # 0.25 stalled cycles per insn (75.39%) - 1.125475723 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.787238e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.561461e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561461e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.635665 sec + 2,588,344,996 cycles # 2.985 GHz + 3,973,729,693 instructions # 1.54 insn per cycle + 0.925601980 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe 
--common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.256660e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.272135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.272135e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.054670 sec - 17,761,310,035 cycles:u # 3.498 GHz (74.95%) - 2,689,568 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 3,663,845,845 stalled-cycles-backend:u # 20.63% backend cycles idle (74.96%) - 55,301,407,855 instructions:u # 3.11 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 5.079610960 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.799107e+04 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 2.814416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.814416e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 5.876793 sec + 18,189,272,845 cycles # 3.093 GHz + 55,241,417,370 instructions # 3.04 insn per cycle + 5.881598403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge 
OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.061177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.077735e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.077735e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.568327 sec - 5,535,132,011 cycles:u # 3.479 GHz (74.86%) - 2,505,823 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.86%) - 1,722,362,524 stalled-cycles-backend:u # 31.12% backend cycles idle (74.86%) - 16,224,209,549 instructions:u # 2.93 insn per cycle - # 0.11 stalled cycles per insn (74.90%) - 1.594493860 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.047168e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.206962e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.206962e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.835268 sec + 5,698,652,492 cycles # 3.098 GHz + 16,175,394,330 instructions # 2.84 insn per cycle + 1.840169365 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.348338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.427878e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.427878e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.721667 sec - 2,578,926,819 cycles:u # 3.465 GHz (74.41%) - 2,120,454 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.92%) - 821,948,498 stalled-cycles-backend:u # 31.87% backend cycles idle (75.28%) - 6,131,768,654 instructions:u # 2.38 insn per cycle - # 0.13 stalled cycles per insn (75.29%) - 0.747585285 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.885686e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953537e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.893142 sec + 2,601,913,974 cycles # 2.900 GHz + 6,121,802,998 instructions # 2.35 insn per cycle + 0.897968889 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.113688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201012e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.799670 sec + 2,307,090,883 cycles # 2.871 GHz + 5,589,004,785 instructions # 2.42 insn per cycle + 0.804426909 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.486631e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.532097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.532097e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.130259 sec + 2,034,132,380 cycles # 1.793 GHz + 3,327,384,092 instructions # 1.64 insn per cycle + 1.135408818 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index d59c4504fd..06f543fbee 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:23:01 +DATE: 2024-02-05_21:10:24 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.321746e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.937510e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.078623e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.358230 sec - 815,009,587 cycles:u # 2.143 GHz (74.71%) - 2,398,638 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.88%) - 5,610,405 stalled-cycles-backend:u # 0.69% backend cycles idle (74.49%) - 1,368,390,070 instructions:u # 1.68 insn per cycle - # 0.00 stalled cycles per insn (74.06%) - 0.405563659 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.384648e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366733e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.479160e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.443785 sec + 1,993,953,840 cycles # 3.008 GHz + 2,818,381,366 instructions # 1.41 insn per cycle + 0.736398900 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.427126e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.704218e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.710859e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.489379 sec - 1,126,447,874 cycles:u # 2.287 GHz (73.61%) - 2,329,063 stalled-cycles-frontend:u # 0.21% frontend cycles idle (73.92%) - 4,613,066 stalled-cycles-backend:u # 0.41% backend cycles idle (75.63%) - 1,569,017,375 instructions:u # 1.39 insn per cycle - # 0.00 stalled cycles per insn (75.42%) - 0.536446725 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.037533e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 4.378680e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.476282e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.496956 sec + 2,155,618,409 cycles # 2.983 GHz + 3,013,698,932 instructions # 1.40 insn per cycle + 0.781903701 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669531526541 -Relative difference = 0.0005401805380429868 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.879133e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.892813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.892813e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.714809 sec - 17,746,155,755 cycles:u # 3.093 GHz (74.92%) - 3,036,147 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 2,957,234,706 stalled-cycles-backend:u # 16.66% backend cycles idle (75.01%) - 55,042,109,625 instructions:u # 3.10 insn per cycle - # 0.05 stalled cycles per insn (75.05%) - 5.740554552 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.794393e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.809440e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.809440e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 5.884011 sec + 18,141,446,314 cycles # 3.081 GHz + 54,990,057,284 instructions # 3.03 insn per cycle + 5.891015557 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.914351e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.007906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.007906e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.676296 sec - 5,213,242,764 cycles:u # 3.066 GHz (75.06%) - 2,215,862 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.06%) - 1,518,921,044 stalled-cycles-backend:u # 29.14% backend cycles idle (75.06%) - 16,252,369,662 instructions:u # 3.12 insn per cycle - # 0.09 stalled cycles per insn (75.07%) - 1.703555763 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.239102e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.406607e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.406607e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.793435 sec + 5,530,022,881 cycles # 3.076 GHz + 16,222,893,517 instructions # 2.93 insn per cycle + 1.806863598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857712652836 -Relative difference = 1.618803841657786e-07 +Avg ME (F77/C++) = 1.4129863487235070 +Relative difference = 2.4679898241023883e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.847863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904148e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.910488 sec - 2,859,225,307 cycles:u # 3.060 GHz (74.57%) - 2,327,605 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.18%) - 820,783,449 stalled-cycles-backend:u # 28.71% backend cycles idle (75.18%) - 6,719,051,225 instructions:u # 2.35 insn per cycle - # 0.12 stalled cycles per insn (75.18%) - 0.937940806 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.638710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.689089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.689089e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.020116 sec + 2,973,003,499 cycles # 2.902 GHz + 6,707,847,761 instructions # 2.26 insn per cycle + 1.032449924 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.802877e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.865060e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.865060e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.929310 sec + 2,704,404,518 cycles # 2.897 GHz + 6,222,577,733 instructions # 2.30 insn per cycle + 0.944067617 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.524110e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.568915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568915e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.097370 sec + 2,152,540,827 cycles # 1.954 GHz + 3,642,461,294 instructions # 1.69 insn per cycle + 1.111654079 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index e4a164fc84..10aee9994d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:23:25 +DATE: 2024-02-05_21:10:49 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.900040e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.044051e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.053092e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.422344 sec - 1,018,676,895 cycles:u # 2.291 GHz (73.42%) - 2,552,424 stalled-cycles-frontend:u # 0.25% frontend cycles idle (71.38%) - 5,008,265 stalled-cycles-backend:u # 0.49% backend cycles idle (73.75%) - 1,523,093,464 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 0.469511377 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.411052e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.037219e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053212e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.462353 sec + 2,044,752,105 cycles # 3.002 GHz + 2,922,291,440 instructions # 1.43 insn per cycle + 0.754029831 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.603353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.819036e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.824195e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.652036 sec - 1,596,517,317 cycles:u # 2.357 GHz (74.50%) - 2,300,346 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.84%) - 5,526,422 stalled-cycles-backend:u # 0.35% backend cycles idle (75.02%) - 1,967,893,783 instructions:u # 1.23 insn per cycle - # 0.00 stalled cycles per insn (75.22%) - 0.711593351 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.037940e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.312467e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329162e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607140 sec + 2,555,586,136 cycles # 3.009 GHz + 3,834,621,246 instructions # 1.50 insn per cycle + 0.907842423 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.590946e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.601468e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.601468e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 6.349934 sec - 20,094,310,453 cycles:u # 3.152 GHz (74.97%) - 2,850,071 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 3,928,007,749 stalled-cycles-backend:u # 19.55% backend cycles idle (75.03%) - 59,145,863,191 instructions:u # 2.94 insn per cycle - # 0.07 stalled cycles per insn (75.03%) - 6.399687674 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525140e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.537527e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.537527e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.512388 sec + 19,949,166,194 cycles # 3.062 GHz + 59,159,604,988 instructions # 2.97 insn per cycle + 6.519783633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.497517e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.543872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.543872e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 3.005856 sec - 9,511,244,318 cycles:u # 3.140 GHz (74.94%) - 2,365,507 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 2,387,042,931 stalled-cycles-backend:u # 25.10% backend cycles idle (74.92%) - 29,810,359,607 instructions:u # 3.13 insn per cycle - # 0.08 stalled cycles per insn (74.84%) - 3.032697298 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.087383e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.135240e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.135240e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.246233 sec + 10,091,680,067 cycles # 3.109 GHz + 29,764,233,813 instructions # 2.95 insn per cycle + 3.262583152 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.132469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.152112e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.152112e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.472531 sec - 4,723,246,589 cycles:u # 3.157 GHz (74.95%) - 2,125,901 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.87%) - 1,578,292,094 stalled-cycles-backend:u # 33.42% backend cycles idle (74.87%) - 11,191,081,410 instructions:u # 2.37 insn per cycle - # 0.14 stalled cycles per insn (74.87%) - 1.499599512 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.852319e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003597e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003597e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.685614 sec + 4,874,885,089 cycles # 2.885 GHz + 11,201,070,619 instructions # 2.30 insn per cycle + 1.697607709 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.141882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.165680e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.165680e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.457259 sec + 4,227,831,629 cycles # 2.893 GHz + 10,145,806,984 instructions # 2.40 insn per cycle + 1.470346665 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.008468e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.125280e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.125280e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.070196 sec + 3,997,404,384 cycles # 1.927 GHz + 5,838,748,265 instructions # 1.46 insn per cycle + 2.086574272 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index bbf72bf4d2..e07e294a1d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-08_18:23:52 +DATE: 2024-02-05_21:11:18 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.881819e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036991e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.418757 sec - 1,057,766,788 cycles:u # 2.417 GHz (73.84%) - 2,187,648 stalled-cycles-frontend:u # 0.21% frontend cycles idle (74.59%) - 5,024,903 stalled-cycles-backend:u # 0.48% backend cycles idle (75.85%) - 1,531,993,399 instructions:u # 1.45 insn per cycle - # 0.00 stalled cycles per insn (76.30%) - 0.460795481 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.435966e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.038196e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054495e+07 ) sec^-1 
+MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.462791 sec + 2,083,546,367 cycles # 3.014 GHz + 2,972,290,724 instructions # 1.43 insn per cycle + 0.769446480 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.593384e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.833698e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838917e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.643290 sec - 1,581,021,170 cycles:u # 2.382 GHz (74.39%) - 2,302,325 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.48%) - 5,629,796 stalled-cycles-backend:u # 0.36% backend cycles idle (74.89%) - 1,950,257,352 instructions:u # 1.23 insn per cycle - # 0.00 stalled cycles per insn (75.96%) - 0.690981216 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.031258e+07 ) sec^-1 
+EvtsPerSec[MatrixElems] (3) = ( 1.302322e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318791e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.605517 sec + 2,552,548,170 cycles # 3.005 GHz + 3,840,818,275 instructions # 1.50 insn per cycle + 0.908380997 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.585955e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.596594e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.596594e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 6.362147 sec - 20,187,670,852 cycles:u # 3.162 GHz (74.94%) - 2,859,211 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 3,681,425,458 stalled-cycles-backend:u # 18.24% backend cycles idle (74.95%) - 58,740,442,273 instructions:u # 2.91 insn per cycle - # 0.06 stalled cycles per insn (75.01%) - 6.387668065 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.591923e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.604576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.604576e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.344174 sec + 19,690,002,782 cycles # 3.102 GHz + 58,706,037,230 instructions # 2.98 insn per cycle + 6.351179172 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.563902e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.612203e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.612203e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.970623 sec - 9,394,603,985 cycles:u # 3.137 GHz (74.89%) - 2,356,159 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) - 2,113,017,475 stalled-cycles-backend:u # 22.49% backend cycles idle (74.91%) - 30,189,438,679 instructions:u # 3.21 insn per cycle - # 0.07 stalled cycles per insn (75.03%) - 2.998060317 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.074664e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.121928e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.121928e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.250511 sec + 10,100,144,581 cycles # 3.103 GHz + 30,158,060,846 instructions # 2.99 insn per cycle + 3.262078242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.097936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.116705e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.116705e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.517904 sec - 4,826,653,513 cycles:u # 3.131 GHz (74.84%) - 2,480,491 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%) - 1,558,060,205 stalled-cycles-backend:u # 32.28% backend cycles idle (75.09%) - 11,661,614,013 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (75.09%) - 1.545034243 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.144723e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.313489e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.313489e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.815494 sec + 5,030,331,211 cycles # 2.763 GHz + 11,663,521,674 instructions # 2.32 insn per cycle + 1.831265530 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.066889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088085e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.557471 sec + 4,540,720,285 cycles # 2.907 GHz + 10,787,106,557 instructions # 2.38 insn per cycle + 1.573334698 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.801981e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913458e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913458e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.124298 sec + 4,048,675,940 cycles # 1.902 GHz + 6,072,800,594 instructions # 1.50 insn per cycle + 2.136798569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 51fae84f72..4386f9eaa6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:24:18 +DATE: 2024-02-05_21:11:47 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.419893e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.570341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571892e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.703629 sec - 1,839,027,261 cycles:u # 2.685 GHz (73.35%) - 2,323,224 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.61%) - 6,233,225 stalled-cycles-backend:u # 0.34% backend cycles idle (75.46%) - 2,082,581,761 instructions:u # 1.13 insn per cycle - # 0.00 stalled cycles per insn (75.55%) - 0.749197644 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.496016e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.529637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.532403e+05 
) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.525349 sec + 2,294,512,437 cycles # 2.993 GHz + 3,492,349,564 instructions # 1.52 insn per cycle + 0.840403466 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.246279e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248758e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.717029 sec - 23,764,997,698 cycles:u # 3.068 GHz (74.98%) - 3,331,784 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 5,721,672 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%) - 19,028,265,618 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 7.771005889 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
4.126320e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.166438e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167839e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.035102 sec + 10,125,737,942 cycles # 3.073 GHz + 21,079,378,058 instructions # 2.08 insn per cycle + 3.352111070 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.975465e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.976259e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.976259e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.310636 sec - 26,068,173,975 cycles:u # 3.128 GHz (74.95%) - 20,587,445 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%) - 3,852,440,225 stalled-cycles-backend:u # 14.78% backend cycles idle (74.98%) - 81,789,228,591 instructions:u # 3.14 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 8.336579974 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.913785e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.914720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914720e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.579787 sec + 26,438,116,213 cycles # 3.080 GHz + 81,752,514,663 instructions # 3.09 insn per cycle + 8.587422083 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.509483e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.513670e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.513670e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.645427 sec - 11,461,884,241 cycles:u # 3.124 GHz (74.95%) - 1,107,292 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) - 1,663,035,588 stalled-cycles-backend:u # 14.51% backend cycles idle (74.93%) - 39,247,572,695 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (74.95%) - 3.672796225 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.829757e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.833256e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.833256e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.292364 sec + 12,887,054,299 cycles # 2.999 GHz + 39,241,639,513 instructions # 3.05 insn per cycle + 4.302239285 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.079252e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081598e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081598e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.528018 sec - 4,829,699,409 cycles:u # 3.113 GHz (74.80%) - 707,883 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%) - 574,218,895 stalled-cycles-backend:u # 11.89% backend cycles idle (74.79%) - 13,835,747,077 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 1.555123003 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.577458e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.594978e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.594978e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.923283 sec + 5,565,898,996 cycles # 2.889 GHz + 13,789,682,242 instructions # 2.48 insn per cycle + 1.937602465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.748302e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.771041e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.771041e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.692940 sec + 4,898,171,848 cycles # 2.887 GHz + 12,318,410,362 instructions # 2.51 insn per cycle + 1.708522131 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.787552e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.802476e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.802476e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.115967 sec + 4,056,542,434 cycles # 1.913 GHz + 6,286,251,479 instructions # 1.55 insn per cycle + 2.129797525 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 1e61565f06..eb520cfa63 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:11:09 +DATE: 2024-02-05_21:58:20 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.384904e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.522670e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.522670e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.661073 sec - 1,963,722,469 cycles:u # 2.878 GHz (75.53%) - 2,436,720 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.96%) - 53,166,474 stalled-cycles-backend:u # 2.71% backend cycles idle (74.28%) - 2,231,069,227 instructions:u # 1.14 insn per cycle - # 0.02 stalled cycles per insn (74.82%) - 0.706607210 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.145693e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489458e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.489458e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.517339 sec + 2,180,189,349 cycles # 2.926 GHz + 3,535,379,038 instructions # 1.62 insn per cycle + 0.806007511 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.209932e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244631e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244631e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.541213 sec - 29,335,442,337 cycles:u # 3.417 GHz (74.94%) - 22,204,977 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%) - 1,148,495,427 stalled-cycles-backend:u # 3.92% backend cycles idle (75.02%) - 23,524,172,196 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 8.611621716 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.625721e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093314e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.299968 sec + 11,007,925,791 cycles # 3.081 GHz + 23,447,985,186 instructions # 2.13 insn per cycle + 3.629584738 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.228682e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.229582e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.229582e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.370176 sec - 25,875,705,823 cycles:u # 3.500 GHz (75.00%) - 3,536,166 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 3,941,158,888 stalled-cycles-backend:u # 15.23% backend cycles idle (75.01%) - 81,723,638,272 instructions:u # 3.16 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 7.395046657 seconds time elapsed +OMP threads / 
`nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.909621e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.910509e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.910509e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.601768 sec + 26,468,647,489 cycles # 3.079 GHz + 81,762,792,143 instructions # 3.09 insn per cycle + 8.606858859 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 
--bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.047525e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.052116e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.052116e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.260880 sec - 11,461,230,297 cycles:u # 3.490 GHz (74.92%) - 1,057,133 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) - 1,664,799,119 stalled-cycles-backend:u # 14.53% backend cycles idle (74.82%) - 39,285,224,142 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (74.95%) - 3.287293927 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.825773e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.829632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.829632e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.300735 sec + 12,906,768,826 cycles # 2.998 GHz + 39,253,927,939 instructions # 3.04 insn per cycle + 4.305912555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.206630e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209265e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.209265e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.370722 sec - 4,816,403,203 cycles:u # 3.456 GHz (74.75%) - 897,807 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%) - 595,702,815 stalled-cycles-backend:u # 12.37% backend cycles idle (75.01%) - 13,825,146,605 instructions:u # 2.87 insn per cycle - # 0.04 stalled cycles per insn (75.26%) - 1.396941426 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.611352e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.629243e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.629243e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.918353 sec + 5,573,969,343 cycles # 2.899 GHz + 13,798,757,056 instructions # 2.48 insn per cycle + 1.923671163 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.728272e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.752598e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.752598e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.699696 sec + 4,912,735,798 cycles # 2.884 GHz + 12,327,911,686 instructions # 2.51 insn per cycle + 1.704745554 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.798492e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.814095e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.814095e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.117095 sec + 4,066,575,770 cycles # 1.917 GHz + 6,296,588,952 instructions # 1.55 insn per cycle + 2.122327452 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index dc839449b6..7d82001653 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,182 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:21:20 +DATE: 2024-02-05_22:10:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.371043e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.569495e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.570900e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.512961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.540583e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.542758e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.650484 sec - 1,944,232,578 cycles:u # 2.906 GHz (75.06%) - 2,566,490 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.53%) - 47,645,331 stalled-cycles-backend:u # 2.45% backend cycles idle (75.78%) - 2,151,229,336 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (74.71%) - 
0.692677245 seconds time elapsed +TOTAL : 0.505487 sec + 2,244,702,706 cycles # 3.012 GHz + 3,497,665,121 instructions # 1.56 insn per cycle + 0.813818087 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.245808e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248801e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248859e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.142807e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178528e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.387158 sec - 28,895,428,422 cycles:u # 3.432 GHz (74.94%) - 11,926,413 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.05%) - 1,138,859,957 stalled-cycles-backend:u # 3.94% backend cycles idle (75.03%) - 
22,682,042,659 instructions:u # 0.78 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 8.441980465 seconds time elapsed +TOTAL : 3.116341 sec + 10,381,161,571 cycles # 3.080 GHz + 21,983,616,246 instructions # 2.12 insn per cycle + 3.426998706 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.226063e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.227012e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.227012e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.926994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.927950e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.927950e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.374847 sec - 25,911,004,356 cycles:u # 3.503 GHz (74.98%) - 4,934,775 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 3,929,539,299 stalled-cycles-backend:u # 15.17% backend cycles idle (75.02%) - 81,751,179,129 instructions:u # 3.16 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 7.398660960 seconds time elapsed +TOTAL : 8.522111 sec + 26,416,984,323 cycles # 3.099 GHz + 81,751,260,197 instructions # 3.09 insn per cycle + 8.526902879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.043786e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.048334e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.048334e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.824894e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828545e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828545e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.259433 sec - 11,462,316,668 cycles:u # 3.494 GHz (74.89%) - 1,101,286 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%) - 1,656,842,817 stalled-cycles-backend:u # 14.45% backend cycles idle (74.92%) - 39,291,812,915 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 3.282743813 seconds time elapsed +TOTAL : 4.302603 sec + 12,900,803,176 cycles # 2.997 GHz + 39,242,002,424 instructions # 3.04 insn per cycle + 4.307598417 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.207796e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210426e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210426e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.654357e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.672586e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.672586e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.366016 sec - 4,823,946,998 cycles:u # 3.476 GHz (74.68%) - 737,483 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) - 604,858,936 stalled-cycles-backend:u # 12.54% backend cycles idle (75.08%) - 13,809,568,520 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.22%) - 1.389473172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +TOTAL : 1.906130 sec + 5,555,557,944 cycles # 2.911 GHz + 13,787,634,047 instructions # 2.48 insn per cycle + 1.910883796 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.550673e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.573296e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.573296e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.737685 sec + 4,902,417,602 cycles # 2.830 GHz + 12,315,886,492 instructions # 2.51 insn per cycle + 1.742410859 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.807297e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.822744e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.822744e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.112283 sec + 4,060,879,337 cycles # 1.919 GHz + 6,283,466,586 instructions # 1.55 insn per cycle + 2.117099643 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 2f51666cca..ca740aa697 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,187 +1,227 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:17:37 +DATE: 2024-02-05_22:03:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.457379e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.590055e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.591542e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.647675 sec - 1,950,784,003 cycles:u # 2.898 GHz (74.98%) - 2,955,540 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.15%) - 38,642,817 stalled-cycles-backend:u # 1.98% backend cycles idle (75.15%) - 2,160,763,754 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (74.99%) - 0.689968370 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.220163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.520037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.522371e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.508013 sec + 2,245,889,851 cycles # 3.028 GHz + 3,557,197,805 instructions # 1.58 insn per cycle + 0.802735999 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.216026e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.248213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.248271e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.491761 sec - 29,220,773,291 cycles:u # 3.425 GHz (74.96%) - 23,526,526 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) - 1,139,934,734 stalled-cycles-backend:u # 3.90% backend cycles idle (75.01%) - 23,436,671,159 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.06%) - 8.550830835 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.741423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.172289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173702e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.189427 sec + 10,589,984,166 cycles # 3.075 GHz + 23,506,097,983 instructions # 2.22 insn per cycle + 3.503661605 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.228857e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.229766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.229766e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.365932 sec - 25,860,325,327 cycles:u # 3.500 GHz (74.99%) - 3,417,700 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 3,793,396,241 stalled-cycles-backend:u # 14.67% backend cycles idle (74.99%) - 81,737,950,688 instructions:u # 3.16 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 7.390284378 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.908714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.909599e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909599e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.601904 sec + 26,416,000,686 cycles # 3.070 GHz + 81,751,768,928 instructions # 3.09 insn per cycle + 8.606661074 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.983789e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.988177e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.988177e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.298665 sec - 11,598,734,474 cycles:u # 3.493 GHz (74.95%) - 2,213,945 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 1,816,599,463 stalled-cycles-backend:u # 15.66% backend cycles idle (74.95%) - 39,318,197,888 instructions:u # 3.39 insn per cycle - # 0.05 stalled cycles per insn (74.88%) - 3.323078386 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.847274e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.850852e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850852e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.273544 sec + 12,895,840,707 cycles # 3.016 GHz + 39,241,783,042 instructions # 3.04 insn per cycle + 4.278279611 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.206033e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208640e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208640e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.367797 sec - 4,826,714,733 cycles:u # 3.473 GHz (74.68%) - 807,379 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.73%) - 618,304,947 stalled-cycles-backend:u # 12.81% backend cycles idle (75.01%) - 13,810,963,654 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.26%) - 1.391784798 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.616490e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.634593e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.634593e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.912479 sec + 5,552,432,995 cycles # 2.897 GHz + 13,787,867,550 instructions # 2.48 insn per cycle + 1.917408982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.750174e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.773348e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.773348e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.691764 sec + 4,896,428,138 cycles # 2.888 GHz + 12,317,937,826 instructions # 2.52 insn per cycle + 1.696849640 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.676695e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.690901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.690901e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.146460 sec + 4,051,401,304 cycles # 1.885 GHz + 6,286,186,428 instructions # 1.55 insn per cycle + 2.151328455 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 5f9cd740db..c1b031e169 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:24:57 +DATE: 2024-02-05_21:12:24 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.405379e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.460398e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.460958e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.528781 sec - 1,385,687,149 cycles:u # 2.539 GHz (74.99%) - 2,277,657 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.18%) - 5,647,445 stalled-cycles-backend:u # 0.41% backend cycles idle (75.23%) - 1,723,056,293 instructions:u # 1.24 insn per cycle - # 0.00 stalled cycles per insn (75.27%) - 0.573805843 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.477440e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.511011e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.513409e+05 
) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.523123 sec + 2,259,536,719 cycles # 2.994 GHz + 3,495,615,515 instructions # 1.55 insn per cycle + 0.827170133 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.739001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.743812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.743938e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.363734 sec - 19,704,908,340 cycles:u # 3.083 GHz (74.87%) - 2,953,635 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) - 5,788,233 stalled-cycles-backend:u # 0.03% backend cycles idle (74.97%) - 15,876,708,731 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 6.416629497 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
4.144882e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.182235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.183656e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.022410 sec + 10,059,246,665 cycles # 3.067 GHz + 22,982,457,538 instructions # 2.28 insn per cycle + 3.336684992 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.987007e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.987813e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.987813e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.261995 sec - 26,146,192,849 cycles:u # 3.156 GHz (75.00%) - 19,881,039 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%) - 3,292,723,902 stalled-cycles-backend:u # 12.59% backend cycles idle (74.99%) - 81,718,806,984 instructions:u # 3.13 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 8.288021078 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.895668e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.896543e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896543e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.662390 sec + 26,452,377,540 cycles # 3.053 GHz + 81,778,558,563 instructions # 3.09 insn per cycle + 8.669649159 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.520526e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.524263e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524263e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.635288 sec - 11,497,017,579 cycles:u # 3.143 GHz (74.94%) - 1,199,037 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 1,535,871,724 stalled-cycles-backend:u # 13.36% backend cycles idle (75.07%) - 39,259,745,497 instructions:u # 3.41 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 3.661377334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.794479e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.797927e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797927e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.332491 sec + 12,903,575,926 cycles # 2.976 GHz + 39,248,558,231 instructions # 3.04 insn per cycle + 4.346983128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.075641e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.077978e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.077978e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.532948 sec - 4,831,457,512 cycles:u # 3.104 GHz (74.88%) - 662,593 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.82%) - 567,155,151 stalled-cycles-backend:u # 11.74% backend cycles idle (74.82%) - 13,833,428,313 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (74.86%) - 1.560029589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.616252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.633907e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.633907e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.912847 sec + 5,552,920,900 cycles # 2.896 GHz + 13,804,312,506 instructions # 2.49 insn per cycle + 1.924450522 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.848997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.873421e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.873421e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.675121 sec + 4,877,663,558 cycles # 2.905 GHz + 12,329,320,941 instructions # 2.53 insn per cycle + 1.686814675 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.700547e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.714897e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.714897e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.139777 sec + 4,102,041,327 cycles # 1.913 GHz + 6,293,429,861 instructions # 1.53 insn per cycle + 2.153998058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 8a7a242de3..076951b3cb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:51:59 +DATE: 2024-02-05_21:47:18 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.423206e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.593883e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.594749e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.637916 sec - 1,865,547,239 cycles:u # 2.851 GHz (74.19%) - 2,330,064 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.04%) - 5,743,309 stalled-cycles-backend:u # 0.31% backend cycles idle (75.53%) - 2,088,439,685 instructions:u # 1.12 insn per cycle - # 0.00 stalled cycles per insn (76.09%) - 0.684161271 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.222522e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.247216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.249235e+05 
) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.529631 sec + 2,304,451,425 cycles # 3.007 GHz + 3,606,899,880 instructions # 1.57 insn per cycle + 0.823545603 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.242800e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245327e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245392e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.696446 sec - 25,980,025,319 cycles:u # 3.365 GHz (75.07%) - 3,610,890 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.06%) - 6,327,050 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%) - 20,725,161,616 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 7.747812070 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
3.770931e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.799512e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800677e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.294556 sec + 10,953,172,236 cycles # 3.084 GHz + 24,077,449,681 instructions # 2.20 insn per cycle + 3.611746200 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.523005e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.523398e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.523398e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 36.267689 sec - 126,724,659,584 cycles:u # 3.492 GHz (75.00%) - 103,201,545 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) - 16,889,689,256 stalled-cycles-backend:u # 13.33% backend cycles idle (75.00%) - 141,470,243,970 instructions:u # 1.12 insn per cycle - # 0.12 stalled cycles per insn (75.00%) - 36.292327891 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.479098e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479597e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479597e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 36.629078 sec + 112,948,926,771 cycles # 3.084 GHz + 141,510,082,785 instructions # 1.25 insn per cycle + 36.633991445 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.659008e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.661448e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.661448e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.490054 sec - 15,779,015,944 cycles:u # 3.497 GHz (75.00%) - 2,135,494 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,458,486,693 stalled-cycles-backend:u # 47.27% backend cycles idle (75.00%) - 37,532,441,528 instructions:u # 2.38 insn per cycle - # 0.20 stalled cycles per insn (75.00%) - 4.515783744 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.310376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.313138e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.313138e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.964302 sec + 14,929,585,775 cycles # 3.005 GHz + 37,533,096,829 instructions # 2.51 insn per cycle + 4.969441025 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.591587e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.601950e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.601950e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.168280 sec - 7,642,041,008 cycles:u # 3.488 GHz (74.70%) - 1,070,869 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 4,390,924,971 stalled-cycles-backend:u # 57.46% backend cycles idle (75.17%) - 12,967,898,753 instructions:u # 1.70 insn per cycle - # 0.34 stalled cycles per insn (75.18%) - 2.194145921 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.873423e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.888108e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.888108e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.092961 sec + 6,035,782,399 cycles # 2.879 GHz + 12,947,327,941 instructions # 2.15 insn per cycle + 2.097818635 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.606441e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.629322e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629322e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.717293 sec + 4,994,518,363 cycles # 2.902 GHz + 11,363,079,717 instructions # 2.28 insn per cycle + 1.722220234 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.116630e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.132912e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.132912e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.031067 sec + 3,895,544,556 cycles # 1.915 GHz + 5,853,606,324 instructions # 1.50 insn per cycle + 2.035959095 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index cef84849ff..24a5052bbf 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:53:09 +DATE: 2024-02-05_21:48:26 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.396830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.450949e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.451460e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.521706 sec - 1,481,318,938 cycles:u # 2.748 GHz (74.81%) - 2,169,944 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.29%) - 5,423,573 stalled-cycles-backend:u # 0.37% backend cycles idle (76.12%) - 1,790,069,205 instructions:u # 1.21 insn per cycle - # 0.00 stalled cycles per insn (76.17%) - 0.566631028 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.253640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.278160e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.280144e+05 
) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.527924 sec + 2,278,775,837 cycles # 3.017 GHz + 3,521,445,566 instructions # 1.55 insn per cycle + 0.812423009 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.743652e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.743775e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.347524 sec - 21,797,324,711 cycles:u # 3.416 GHz (74.97%) - 3,038,624 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 4,838,637 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) - 17,450,782,853 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 6.406516717 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
3.792881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.822944e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.270966 sec + 10,872,573,045 cycles # 3.074 GHz + 24,270,572,338 instructions # 2.23 insn per cycle + 3.593854665 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.563502e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.563872e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.563872e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.944992 sec - 126,053,509,205 cycles:u # 3.505 GHz (75.00%) - 11,395,374 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 18,435,255,585 stalled-cycles-backend:u # 14.62% backend cycles idle (75.00%) - 141,645,144,485 instructions:u # 1.12 insn per cycle - # 0.13 stalled cycles per insn (75.00%) - 35.969683395 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.439063e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.439545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.439545e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 36.958546 sec + 113,902,158,027 cycles # 3.082 GHz + 141,695,308,963 instructions # 1.24 insn per cycle + 36.963459772 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.631101e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.633522e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.633522e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.524338 sec - 15,911,871,415 cycles:u # 3.500 GHz (74.97%) - 5,229,595 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.02%) - 7,656,219,059 stalled-cycles-backend:u # 48.12% backend cycles idle (75.02%) - 37,566,202,173 instructions:u # 2.36 insn per cycle - # 0.20 stalled cycles per insn (75.02%) - 4.550369242 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.314783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317486e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317486e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.957427 sec + 14,881,560,951 cycles # 3.000 GHz + 37,592,648,542 instructions # 2.53 insn per cycle + 4.962388959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.703077e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.713803e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.713803e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.136816 sec - 7,529,467,375 cycles:u # 3.487 GHz (74.70%) - 1,210,559 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) - 4,270,335,713 stalled-cycles-backend:u # 56.71% backend cycles idle (75.18%) - 12,852,865,982 instructions:u # 1.71 insn per cycle - # 0.33 stalled cycles per insn (75.18%) - 2.162649621 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.059572e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.075102e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.075102e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.044832 sec + 5,940,592,925 cycles # 2.900 GHz + 12,831,574,033 instructions # 2.16 insn per cycle + 2.049914401 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.581576e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.604096e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.604096e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.721363 sec + 4,986,832,806 cycles # 2.890 GHz + 11,359,238,555 instructions # 2.28 insn per cycle + 1.726371578 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.082101e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.098338e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.098338e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.039340 sec + 3,890,542,782 cycles # 1.904 GHz + 5,843,067,877 instructions # 1.50 insn per cycle + 2.044270183 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index db75b7c23c..2a26f6c49e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:25:33 +DATE: 2024-02-05_21:13:01 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.603654e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780655e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781379e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.440792 sec - 1,154,541,552 cycles:u # 2.519 GHz (74.24%) - 2,198,707 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.83%) - 4,451,469 stalled-cycles-backend:u # 0.39% backend cycles idle (75.14%) - 1,580,717,351 instructions:u # 1.37 insn per cycle - # 0.00 stalled cycles per insn (75.33%) - 0.489512861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.334488e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.395305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401279e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.479871 sec + 2,082,107,480 cycles # 2.987 GHz + 3,068,311,706 instructions # 1.47 insn per cycle + 0.776340123 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.700797e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.727360e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.727823e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.641675 sec - 7,997,770,380 cycles:u # 3.004 GHz (74.83%) - 2,555,358 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) - 4,924,419 stalled-cycles-backend:u # 0.06% backend cycles idle (75.08%) - 6,831,619,214 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 2.687486810 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
8.553737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.638533e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641807e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.723684 sec + 5,920,743,472 cycles # 3.029 GHz + 12,600,426,909 instructions # 2.13 insn per cycle + 2.014393536 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.258730e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.259735e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.259735e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.267483 sec - 23,372,915,591 cycles:u # 3.206 GHz (74.98%) - 1,342,333 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,729,199,994 stalled-cycles-backend:u # 11.68% backend cycles idle (74.98%) - 75,832,248,218 instructions:u # 3.24 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 7.293005862 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.101564e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.102651e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.102651e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 7.813063 sec + 24,199,832,283 cycles # 3.096 GHz + 75,878,496,374 instructions # 3.14 insn per cycle + 7.820369454 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.243925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.254539e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.254539e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 2.271192 sec - 6,771,611,711 cycles:u # 2.948 GHz (74.97%) - 1,269,186 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) - 1,159,889,931 stalled-cycles-backend:u # 17.13% backend cycles idle (74.93%) - 20,147,624,321 instructions:u # 2.98 insn per cycle - # 0.06 stalled cycles per insn (74.96%) - 2.305102688 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.355480e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.368368e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.368368e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.239129 sec + 6,484,377,988 cycles # 2.892 GHz + 20,115,449,226 instructions # 3.10 insn per cycle + 2.250294450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.074907e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.083888e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.083888e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.797255 sec - 2,486,142,212 cycles:u # 3.028 GHz (74.67%) - 651,261 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.67%) - 237,286,614 stalled-cycles-backend:u # 9.54% backend cycles idle (74.74%) - 7,086,981,766 instructions:u # 2.85 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 0.824650441 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.698731e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.705466e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.705466e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.974985 sec + 2,819,368,645 cycles # 2.882 GHz + 7,038,056,878 instructions # 2.50 insn per cycle + 0.989199229 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.942632e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951445e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951445e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.852688 sec + 2,476,830,184 cycles # 2.890 GHz + 6,280,101,874 instructions # 2.54 insn per cycle + 0.869178735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.560069e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565955e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565955e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.059815 sec + 2,036,227,562 cycles # 1.913 GHz + 3,248,407,876 instructions # 1.60 insn per cycle + 1.071376552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index fffca73371..bbf3fbe6ee 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:11:45 +DATE: 2024-02-05_21:58:57 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.585947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.760142e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.760142e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.444317 sec - 1,232,861,961 cycles:u # 2.651 GHz (74.23%) - 3,052,215 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.82%) - 50,695,897 stalled-cycles-backend:u # 4.11% backend cycles idle (76.49%) - 1,628,289,331 instructions:u # 1.32 insn per cycle - # 0.03 stalled cycles per insn (76.75%) - 0.490521428 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.679886e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.329455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.329455e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.467337 sec + 2,052,749,916 cycles # 2.990 GHz + 3,071,904,430 instructions # 1.50 insn per cycle + 0.745966161 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.269703e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.713043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713043e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.440256 sec - 11,490,476,382 cycles:u # 3.308 GHz (74.90%) - 38,895,273 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.98%) - 1,129,565,212 stalled-cycles-backend:u # 9.83% backend cycles idle (75.15%) - 9,919,336,188 instructions:u # 0.86 insn per cycle - # 0.11 stalled cycles per insn (75.16%) - 3.499527101 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.238688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.463206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.463206e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.898849 sec + 6,460,514,384 cycles # 3.024 GHz + 13,711,212,771 instructions # 2.12 insn per cycle + 2.194001425 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.472987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.474035e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.474035e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.639693 sec - 23,332,846,989 cycles:u # 3.502 GHz (74.98%) - 1,288,164 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 2,737,407,629 stalled-cycles-backend:u # 11.73% backend cycles idle (75.02%) - 75,874,614,222 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 6.664462835 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.088417e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.089483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.089483e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 7.863905 sec + 24,222,682,650 cycles # 3.080 GHz + 75,881,880,568 instructions # 3.13 insn per cycle + 7.868677803 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.951853e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.969495e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.969495e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.656113 sec - 5,836,586,779 cycles:u # 3.477 GHz (74.78%) - 717,450 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) - 848,310,772 stalled-cycles-backend:u # 14.53% backend cycles idle (75.11%) - 20,140,599,240 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.23%) - 1.682172413 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.610137e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.625294e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.625294e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.165952 sec + 6,500,582,172 cycles # 2.998 GHz + 20,124,386,165 instructions # 3.10 insn per cycle + 2.170723762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.361989e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.372243e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.372243e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.702439 sec - 2,486,852,526 cycles:u # 3.431 GHz (74.62%) - 558,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.62%) - 304,098,180 stalled-cycles-backend:u # 12.23% backend cycles idle (74.71%) - 7,106,951,943 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.22%) - 0.728850903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.710509e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.717516e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.717516e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.969667 sec + 2,825,954,460 cycles # 2.903 GHz + 7,046,906,721 instructions # 2.49 insn per cycle + 0.974421373 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.939035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.948865e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.948865e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.856524 sec + 2,486,142,670 cycles # 2.890 GHz + 6,288,929,352 instructions # 2.53 insn per cycle + 0.861361903 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.549562e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.555377e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.555377e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.069429 sec + 2,044,889,978 cycles # 1.905 GHz + 3,257,809,276 instructions # 1.59 insn per cycle + 1.074199449 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 1a81f08f91..881037651a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,182 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:21:56 +DATE: 2024-02-05_22:10:40 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.514466e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.765340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.766977e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.439790 sec - 1,244,201,437 cycles:u # 2.707 GHz (74.39%) - 2,857,674 stalled-cycles-frontend:u # 0.23% frontend cycles idle (73.93%) - 33,629,601 stalled-cycles-backend:u # 2.70% backend cycles idle (74.64%) - 1,575,400,945 instructions:u # 1.27 insn per cycle - # 0.02 stalled cycles per insn (76.06%) - 0.481133771 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.337925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 6.395257e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.462308 sec + 2,034,668,664 cycles # 3.023 GHz + 3,018,996,605 instructions # 1.48 insn per cycle + 0.730931017 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.728443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.728888e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.306610 sec - 11,100,655,015 cycles:u # 3.333 GHz (75.02%) - 28,009,471 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.04%) - 1,145,966,139 stalled-cycles-backend:u # 10.32% backend cycles idle (75.07%) - 9,001,507,576 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.06%) - 3.353666898 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.577816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.654751e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.794466 sec + 6,241,889,186 cycles # 3.071 GHz + 13,099,547,835 instructions # 2.10 insn per cycle + 2.089963432 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 
64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.476067e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.477134e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.477134e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.114831e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.115954e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.115954e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.629405 sec - 23,294,577,294 cycles:u # 3.503 GHz (74.98%) - 1,353,787 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,744,428,038 stalled-cycles-backend:u # 11.78% backend cycles idle (74.98%) - 75,897,993,100 instructions:u # 3.26 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 6.652333857 seconds time elapsed +TOTAL : 7.764258 sec + 24,215,823,633 cycles # 3.118 GHz + 75,876,946,025 instructions # 3.13 insn per cycle + 7.768790583 
seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.953957e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.971733e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.971733e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.653575 sec - 5,843,689,539 cycles:u # 3.489 GHz (74.72%) - 730,607 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) - 846,579,829 stalled-cycles-backend:u # 14.49% backend cycles idle (75.12%) - 20,140,448,269 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.17%) - 1.676778995 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.487445e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.500995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.500995e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 2.199553 sec + 6,495,792,958 cycles # 2.953 GHz + 20,114,445,952 instructions # 3.10 insn per cycle + 2.204072119 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.352801e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363073e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.703244 sec - 2,498,771,333 cycles:u # 3.446 GHz (74.63%) - 581,092 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.63%) - 306,067,701 stalled-cycles-backend:u # 12.25% backend cycles idle (74.63%) - 7,101,942,909 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (74.82%) - 0.726696573 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.699836e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.707208e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.707208e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.974730 sec + 2,823,184,438 cycles # 2.886 GHz + 7,036,774,400 instructions # 2.49 insn per cycle + 0.979278056 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.944843e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953999e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953999e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.852312 sec + 2,479,942,344 cycles # 2.897 GHz + 6,275,567,381 instructions # 2.53 insn per cycle + 0.856933859 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.563188e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.569271e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.569271e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 1.058689 sec + 2,038,090,504 cycles # 1.919 GHz + 3,244,115,296 instructions # 1.59 insn per cycle + 1.063236251 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index b29358c30c..5ba28b310e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,187 +1,227 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_19:18:13 +DATE: 2024-02-05_22:04:05 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.589313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.760043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.761719e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.442297 sec - 1,270,659,451 cycles:u # 2.722 GHz (73.76%) - 3,440,593 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.34%) - 34,153,287 stalled-cycles-backend:u # 2.69% backend cycles idle (74.91%) - 1,612,304,356 instructions:u # 1.27 insn per cycle - # 0.02 stalled cycles per insn (76.39%) - 0.485585677 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.740544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.404954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.410693e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.466537 sec + 1,999,385,948 cycles # 2.951 GHz + 3,067,244,681 instructions # 1.53 insn per cycle + 0.734843083 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.295674e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.723764e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.724198e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.417280 sec - 11,488,321,170 cycles:u # 3.329 GHz (75.04%) - 39,406,344 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.04%) - 1,148,055,081 stalled-cycles-backend:u # 9.99% backend cycles idle (74.97%) - 9,911,157,892 instructions:u # 0.86 insn per cycle - # 0.12 stalled cycles per insn (74.95%) - 3.471056448 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.501615e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641152e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644324e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.819126 sec + 6,263,393,299 cycles # 3.061 GHz + 13,206,274,346 instructions # 2.11 insn per cycle + 2.110646534 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.471309e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.472359e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.472359e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.642241 sec - 23,339,711,815 cycles:u # 3.502 GHz (74.95%) - 1,451,091 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 2,749,255,146 stalled-cycles-backend:u # 11.78% backend cycles idle (75.03%) - 75,875,175,331 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 6.666216415 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.077876e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.078899e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.078899e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 7.901308 sec + 24,218,143,686 cycles # 3.064 GHz + 75,878,622,530 instructions # 3.13 insn per cycle + 7.905943320 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.963885e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.981548e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.981548e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.652129 sec - 5,840,229,414 cycles:u # 3.489 GHz (74.71%) - 711,949 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 842,473,856 stalled-cycles-backend:u # 14.43% backend cycles idle (75.15%) - 20,139,686,739 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 1.675893194 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.608896e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.623255e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.623255e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.163733 sec + 6,487,922,446 cycles # 2.994 GHz + 20,114,268,161 instructions # 3.10 insn per cycle + 2.168256412 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.358784e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.369131e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.369131e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.701451 sec - 2,496,799,364 cycles:u # 3.451 GHz (74.58%) - 541,349 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.58%) - 303,569,480 stalled-cycles-backend:u # 12.16% backend cycles idle (74.66%) - 7,097,254,370 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 0.725439966 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.707369e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.714719e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.714719e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.969111 sec + 2,835,385,661 cycles # 2.915 GHz + 7,037,071,150 instructions # 2.48 insn per cycle + 0.973627782 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.796935e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.805643e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.805643e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.921646 sec + 2,479,592,241 cycles # 2.682 GHz + 6,279,594,335 instructions # 2.53 insn per cycle + 0.926599340 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.571973e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.577972e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.577972e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.051221 sec + 2,033,761,132 cycles # 1.928 GHz + 3,247,291,972 instructions # 1.60 insn per cycle + 1.055684386 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index c47054a2d7..38a19c6467 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:26:04 +DATE: 2024-02-05_21:13:31 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.574080e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.764946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.765663e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.462133 sec - 1,130,126,390 cycles:u # 2.394 GHz (73.37%) - 2,241,615 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.40%) - 5,278,042 stalled-cycles-backend:u # 0.47% backend cycles idle (75.67%) - 1,542,456,049 instructions:u # 1.36 insn per cycle - # 0.00 stalled cycles per insn (76.31%) - 0.510089687 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.299974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.364460e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.480230 sec + 2,110,623,004 cycles # 3.014 GHz + 3,051,762,313 instructions # 1.45 insn per cycle + 0.785636939 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.717453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.744677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.745205e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.637025 sec - 7,807,862,465 cycles:u # 2.935 GHz (74.88%) - 2,445,734 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) - 4,656,932 stalled-cycles-backend:u # 0.06% backend cycles idle (74.86%) - 6,756,353,612 instructions:u # 0.87 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 2.688081893 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
8.553894e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.626828e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.630131e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.720072 sec + 5,949,004,472 cycles # 3.045 GHz + 11,591,413,648 instructions # 1.95 insn per cycle + 2.011142873 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.175682e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.176601e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.176601e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.544535 sec - 23,318,888,360 cycles:u # 3.081 GHz (74.95%) - 1,412,271 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,230,475,680 stalled-cycles-backend:u # 9.57% backend cycles idle (74.98%) - 75,827,109,693 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 7.570599830 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.101524e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.102627e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.102627e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 7.813552 sec + 24,191,703,837 cycles # 3.095 GHz + 75,800,583,581 instructions # 3.13 insn per cycle + 7.820528848 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866108667618E-004 -Relative difference = 5.871505118544242e-08 +Avg ME (F77/C++) = 6.6274870430095556E-004 +Relative difference = 6.489572191632735e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.773713e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.789536e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.789536e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.875876 sec - 5,837,657,312 cycles:u # 3.073 GHz (74.62%) - 745,118 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.81%) - 831,661,751 stalled-cycles-backend:u # 14.25% backend cycles idle (75.16%) - 20,114,759,991 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 1.903414736 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.533729e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.548052e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.548052e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.185486 sec + 6,493,193,059 cycles # 2.965 GHz + 20,110,924,734 instructions # 3.10 insn per cycle + 2.248042621 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.089617e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.098746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.098746e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.791538 sec - 2,482,634,076 cycles:u # 3.044 GHz (74.62%) - 523,574 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.50%) - 301,565,450 stalled-cycles-backend:u # 12.15% backend cycles idle (74.58%) - 7,085,054,777 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 0.818943329 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.715181e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.722216e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.722216e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.964967 sec + 2,812,074,979 cycles # 2.903 GHz + 7,037,571,648 instructions # 2.50 insn per cycle + 0.980644370 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.945182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.954199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954199e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.851822 sec + 2,474,162,029 cycles # 2.891 GHz + 6,280,078,187 instructions # 2.54 insn per cycle + 0.864965859 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.559682e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565547e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565547e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.059915 sec + 2,038,653,616 cycles # 1.915 GHz + 3,247,299,383 instructions # 1.59 insn per cycle + 1.075802468 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 434d8447ad..5ecd50d8da 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:54:16 +DATE: 2024-02-05_21:49:34 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.570012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.763594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.765152e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.425460 sec - 1,135,202,481 cycles:u # 2.555 GHz (75.07%) - 2,145,635 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.04%) - 6,038,783 stalled-cycles-backend:u # 0.53% backend cycles idle (74.71%) - 1,550,757,081 instructions:u # 1.37 insn per cycle - # 0.00 stalled cycles per insn (74.70%) - 0.470654879 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.571051e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.616606e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620923e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.488255 sec + 2,117,936,620 cycles # 2.989 GHz + 3,170,007,418 instructions # 1.50 insn per cycle + 0.770960722 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.703320e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731268e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731699e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.614314 sec - 8,769,750,029 cycles:u # 3.322 GHz (74.93%) - 2,537,311 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.14%) - 5,667,727 stalled-cycles-backend:u # 0.06% backend cycles idle (75.03%) - 7,386,308,964 instructions:u # 0.84 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 2.666031947 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
7.724432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.783959e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.786528e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.852214 sec + 6,378,559,976 cycles # 3.053 GHz + 13,824,910,875 instructions # 2.17 insn per cycle + 2.145918086 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 6.343794e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.344484e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.344484e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 25.858345 sec - 90,701,070,690 cycles:u # 3.505 GHz (74.99%) - 534,988,208 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) - 6,752,209,392 stalled-cycles-backend:u # 7.44% backend cycles idle (74.99%) - 134,091,433,811 instructions:u # 1.48 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 25.882815835 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.881511e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.882353e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.882353e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 27.892796 sec + 85,879,383,305 cycles # 3.079 GHz + 133,985,180,489 instructions # 1.56 insn per cycle + 27.897590626 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340697351248E-004 -Relative difference = 1.052203199451665e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275354356437610E-004 +Relative difference = 6.573239683366044e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.093528e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.105198e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.105198e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.032623 sec - 7,167,162,470 cycles:u # 3.488 GHz (74.98%) - 6,640,055 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.09%) - 3,073,494,724 stalled-cycles-backend:u # 42.88% backend cycles idle (75.09%) - 19,179,691,319 instructions:u # 2.68 insn per cycle - # 0.16 stalled cycles per insn (75.09%) - 2.058490331 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.083048e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.096099e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.096099e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.328040 sec + 6,719,869,961 cycles # 2.885 GHz + 19,163,430,307 instructions # 2.85 insn per cycle + 2.333347546 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857053714997E-004 -Relative difference = 4.445554471174176e-08 +Avg ME (F77/C++) = 6.6274859783433532E-004 +Relative difference = 3.2677016209485094e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.480499e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.484527e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.484527e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.114030 sec - 3,937,359,669 cycles:u # 3.465 GHz (74.66%) - 599,044 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.71%) - 2,266,035,223 stalled-cycles-backend:u # 57.55% backend cycles idle (75.07%) - 6,767,958,144 instructions:u # 1.72 insn per cycle - # 0.33 stalled cycles per insn (75.37%) - 1.139732435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.523354e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528989e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528989e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.085366 sec + 3,139,497,739 cycles # 2.883 GHz + 6,746,894,389 instructions # 2.15 insn per cycle + 1.090164339 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735722101156E-004 -Relative difference = 6.454990161554483e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.851486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.859999e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859999e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.894209 sec + 2,606,414,605 cycles # 2.903 GHz + 5,930,989,921 instructions # 2.28 insn per cycle + 0.899025453 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.546446e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.552539e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.552539e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.069126 sec + 2,048,063,476 cycles # 1.909 GHz + 3,435,686,018 instructions # 1.68 insn per cycle + 1.073902025 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272748295826550E-004 +Relative difference = 2.5714542480216212e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 4c84e01f99..b0f7a52d59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:55:05 +DATE: 2024-02-05_21:50:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.568788e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.766183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.768168e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.423804 sec - 1,150,029,388 cycles:u # 2.592 GHz (75.47%) - 2,123,880 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) - 5,213,875 stalled-cycles-backend:u # 0.45% backend cycles idle (75.57%) - 1,542,493,709 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (75.30%) - 0.472356355 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.540170e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.582001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.586495e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.484380 sec + 2,126,942,389 cycles # 3.017 GHz + 3,183,788,929 instructions # 1.50 insn per cycle + 0.765419001 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.723431e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.750359e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.750885e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.609005 sec - 8,745,463,980 cycles:u # 3.319 GHz (74.88%) - 2,568,646 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.12%) - 5,702,914 stalled-cycles-backend:u # 0.07% backend cycles idle (75.13%) - 7,426,953,439 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 2.662163005 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
7.694061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.754237e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.756851e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.856995 sec + 6,389,024,738 cycles # 3.056 GHz + 13,438,339,449 instructions # 2.10 insn per cycle + 2.150054585 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 6.227875e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.228540e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.228540e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 26.339306 sec - 92,374,923,336 cycles:u # 3.504 GHz (75.00%) - 438,465,384 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) - 6,550,802,401 stalled-cycles-backend:u # 7.09% backend cycles idle (74.99%) - 133,996,906,633 instructions:u # 1.45 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 26.363927936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.892751e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.893597e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.893597e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 27.839605 sec + 85,632,537,332 cycles # 3.076 GHz + 134,110,079,741 instructions # 1.57 insn per cycle + 27.844551025 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275346486299042E-004 -Relative difference = 5.301670926116898e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627536e-04 +Avg ME (F77/C++) = 6.6275357377482830E-004 +Relative difference = 3.95700176737784e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.448607e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.461579e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.461579e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.947250 sec - 6,863,113,366 cycles:u # 3.484 GHz (74.82%) - 737,553 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.74%) - 3,334,442,234 stalled-cycles-backend:u # 48.58% backend cycles idle (74.95%) - 19,238,579,959 instructions:u # 2.80 insn per cycle - # 0.17 stalled cycles per insn (75.23%) - 1.973159628 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.422115e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.436822e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.436822e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.217876 sec + 6,734,489,864 cycles # 3.031 GHz + 19,223,110,429 instructions # 2.85 insn per cycle + 2.222784192 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857044990032E-004 -Relative difference = 4.4587192899226015e-08 +Avg ME (F77/C++) = 6.6274859765498573E-004 +Relative difference = 3.538316437387639e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.500626e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.504745e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.504745e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.099013 sec - 3,884,604,581 cycles:u # 3.463 GHz (75.04%) - 1,235,591 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.04%) - 2,198,119,202 stalled-cycles-backend:u # 56.59% backend cycles idle (75.05%) - 6,705,713,189 instructions:u # 1.73 insn per cycle - # 0.33 stalled cycles per insn (75.05%) - 1.129463927 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.435689e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.441032e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.441032e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.151356 sec + 3,077,944,189 cycles # 2.664 GHz + 6,686,500,879 instructions # 2.17 insn per cycle + 1.156273729 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735755491807E-004 -Relative difference = 6.404606472340801e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.696521e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.704211e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.704211e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.975502 sec + 2,608,790,145 cycles # 2.663 GHz + 5,935,625,436 instructions # 2.28 insn per cycle + 0.980501381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.371523e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.376383e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.376383e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.204978 sec + 2,046,707,084 cycles # 1.693 GHz + 3,423,438,147 instructions # 1.67 insn per cycle + 1.210209313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272749650985591E-004 +Relative difference = 5.26633351741962e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 21eb60d64b..5e76674d00 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:26:34 +DATE: 2024-02-05_21:14:00 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.383338e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.553589e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.555084e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.663207 sec - 1,784,805,123 cycles:u # 2.621 GHz (73.34%) - 2,371,286 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.76%) - 5,590,912 stalled-cycles-backend:u # 0.31% backend cycles idle (75.35%) - 2,055,029,836 instructions:u # 1.15 insn per cycle - # 0.00 stalled cycles per insn (75.35%) - 0.710045389 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.465947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.494792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497225e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.524953 sec + 2,256,489,010 cycles # 2.939 GHz + 3,447,539,579 instructions # 1.53 insn per cycle + 0.836200977 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.248062e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.250593e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.250651e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.718381 sec - 23,348,642,052 cycles:u # 3.022 GHz (74.93%) - 3,266,528 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 6,116,139 stalled-cycles-backend:u # 0.03% backend cycles idle (75.06%) - 18,753,652,615 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 7.772673424 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
4.125742e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.159824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.161295e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.034549 sec + 10,082,494,977 cycles # 3.063 GHz + 22,355,930,391 instructions # 2.22 insn per cycle + 3.348702480 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.898017e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.898749e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.898749e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.649331 sec - 26,772,745,603 cycles:u # 3.087 GHz (74.99%) - 57,039,124 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.00%) - 3,873,693,774 stalled-cycles-backend:u # 14.47% backend cycles idle (75.00%) - 82,448,956,627 instructions:u # 3.08 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 8.675961875 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.884772e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.885701e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.885701e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.711657 sec + 26,806,493,916 cycles # 3.076 GHz + 82,459,435,426 instructions # 3.08 insn per cycle + 8.718797714 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.502956e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.507031e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.507031e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.650850 sec - 11,298,807,934 cycles:u # 3.075 GHz (74.99%) - 3,701,304 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 1,198,952,375 stalled-cycles-backend:u # 10.61% backend cycles idle (74.97%) - 38,525,344,308 instructions:u # 3.41 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 3.678443473 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779319e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.782699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.349718 sec + 12,635,010,271 cycles # 2.902 GHz + 38,537,183,996 instructions # 3.05 insn per cycle + 4.365144686 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.075722e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.077981e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.077981e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.533338 sec - 4,767,692,102 cycles:u # 3.058 GHz (74.84%) - 875,813 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%) - 518,076,988 stalled-cycles-backend:u # 10.87% backend cycles idle (74.86%) - 13,617,685,988 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (74.73%) - 1.597738771 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.648503e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.666677e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.666677e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.906583 sec + 5,531,649,242 cycles # 2.895 GHz + 13,584,392,935 instructions # 2.46 insn per cycle + 1.919917908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.700993e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.723114e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.723114e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.700317 sec + 4,941,121,910 cycles # 2.899 GHz + 12,109,565,477 instructions # 2.45 insn per cycle + 1.711862121 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.716895e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.731839e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.731839e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.134730 sec + 4,095,106,977 cycles # 1.915 GHz + 6,282,362,404 instructions # 1.53 insn per cycle + 2.148505489 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index d643daf349..562f5e1d4c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-08_18:27:12 +DATE: 2024-02-05_21:14:37 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.384844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.440698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.441232e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.532172 sec - 1,450,165,108 cycles:u # 2.635 GHz (73.15%) - 2,344,129 stalled-cycles-frontend:u # 0.16% frontend cycles idle (73.67%) - 4,952,838 stalled-cycles-backend:u # 0.34% backend cycles idle (75.78%) - 1,799,622,271 instructions:u # 1.24 insn per cycle - # 0.00 stalled cycles per insn (75.69%) - 0.578640825 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.477704e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.507523e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.510116e+05 ) 
sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.527379 sec + 2,175,766,055 cycles # 2.841 GHz + 3,418,769,839 instructions # 1.57 insn per cycle + 0.834947605 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.732714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.737520e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.737629e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.386833 sec - 19,706,454,901 cycles:u # 3.074 GHz (74.97%) - 3,130,006 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.05%) - 5,448,176 stalled-cycles-backend:u # 0.03% backend cycles idle (75.05%) - 15,940,692,639 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 6.438658275 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 
4.125116e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.158909e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.160233e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.022032 sec + 10,121,111,635 cycles # 3.087 GHz + 22,821,428,883 instructions # 2.25 insn per cycle + 3.334908174 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.978606e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.979399e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.979399e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.297021 sec - 26,214,200,018 cycles:u # 3.151 GHz (75.00%) - 14,108,761 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%) - 3,509,403,049 stalled-cycles-backend:u # 13.39% backend cycles idle (75.00%) - 82,326,114,832 instructions:u # 3.14 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 8.322924348 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.883492e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.884333e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.884333e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.718497 sec + 26,807,367,386 cycles # 3.074 GHz + 82,358,991,278 instructions # 3.07 insn per cycle + 8.725625839 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.594292e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.598635e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.598635e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.578014 sec - 11,347,535,968 cycles:u # 3.151 GHz (74.93%) - 5,170,678 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.90%) - 1,362,756,631 stalled-cycles-backend:u # 12.01% backend cycles idle (74.84%) - 38,598,602,630 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (74.95%) - 3.605311382 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.755460e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.758746e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.758746e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.376440 sec + 12,657,723,358 cycles # 2.890 GHz + 38,556,519,238 instructions # 3.05 insn per cycle + 4.389904658 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.107638e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110113e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110113e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.488940 sec - 4,782,141,770 cycles:u # 3.162 GHz (74.74%) - 1,620,929 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 473,222,387 stalled-cycles-backend:u # 9.90% backend cycles idle (75.15%) - 13,605,640,779 instructions:u # 2.85 insn per cycle - # 0.03 stalled cycles per insn (75.15%) - 1.516094692 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.672799e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.691336e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.691336e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.901504 sec + 5,498,608,733 cycles # 2.886 GHz + 13,598,345,039 instructions # 2.47 insn per cycle + 1.913319846 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.786330e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.809305e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.809305e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.685546 sec + 4,831,483,004 cycles # 2.859 GHz + 12,121,611,558 instructions # 2.51 insn per cycle + 1.697656178 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.724730e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.739490e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.739490e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.136169 sec + 4,091,747,321 cycles # 1.914 GHz + 6,289,952,093 instructions # 1.54 insn per cycle + 2.148137472 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index c2c8b7de5b..7d3ee494d3 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:29:20 +DATE: 2024-02-05_21:17:01 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.014738e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.080986e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.081072e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.534892 sec - 28,671,498,376 cycles:u # 3.043 GHz (74.91%) - 3,858,221 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 6,926,757 stalled-cycles-backend:u # 0.02% backend cycles idle (75.05%) - 22,688,422,982 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 9.598603768 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.060831e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061257e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.061386e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.455073 sec + 8,445,387,754 cycles # 3.085 GHz + 17,531,542,705 instructions # 2.08 insn per cycle + 2.853698533 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.562860e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566064e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 8.964154 sec - 27,295,730,208 cycles:u # 3.038 GHz (74.96%) - 3,490,636 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 7,315,078 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 21,720,146,331 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.012369847 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 9.224911e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.227118e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.227325e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.995006 sec + 13,041,364,906 cycles # 3.027 GHz + 28,190,198,598 instructions # 2.16 insn per cycle + 4.376446903 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.963792e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.964026e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.964026e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.892164 sec - 18,219,567,667 cycles:u # 3.080 GHz (74.98%) - 29,863,517 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%) - 2,207,836,365 stalled-cycles-backend:u # 12.12% backend cycles idle (74.98%) - 55,186,598,006 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 5.918128422 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.492023e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.492274e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492274e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.223749 sec + 18,981,815,870 cycles # 3.050 GHz + 55,179,677,185 instructions # 2.91 insn per cycle + 6.230649212 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.981674e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.981786e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.981786e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.665922 sec - 8,287,182,014 cycles:u # 3.082 GHz (75.05%) - 1,561,165 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.01%) - 781,453,679 stalled-cycles-backend:u # 9.43% backend cycles idle (75.01%) - 27,098,042,601 instructions:u # 3.27 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 2.692616653 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.671790e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.671881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.671881e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.166948 sec + 9,790,094,499 cycles # 3.091 GHz + 27,056,149,583 instructions # 2.76 insn per cycle + 3.181218090 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.713124e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.713709e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.713709e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.118807 sec - 3,604,878,612 cycles:u # 3.160 GHz (74.85%) - 1,062,079 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.76%) - 285,760,489 stalled-cycles-backend:u # 7.93% backend cycles idle (74.76%) - 9,607,206,483 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (74.82%) - 1.144369218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.621678e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622134e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622134e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.466253 sec + 4,240,132,958 cycles # 2.890 GHz + 9,565,614,864 instructions # 2.26 insn per cycle + 1.477574402 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.190305e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.190882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.190882e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.269480 sec + 3,685,978,412 cycles # 2.903 GHz + 8,451,253,639 instructions # 2.29 insn per cycle + 1.282888481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.621525e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.622079e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.622079e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.466027 sec + 2,777,501,738 cycles # 1.893 GHz + 4,249,530,672 instructions # 1.53 insn per cycle + 1.478225043 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 561061960f..1d5f961b11 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_19:12:13 +DATE: 2024-02-05_21:59:26 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.044322e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.045083e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.045083e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.391291 sec - 32,517,683,109 cycles:u # 3.454 GHz (74.98%) - 3,689,842 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 7,133,306 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%) - 25,691,612,283 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.440910348 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.064929e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065860e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065860e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.360501 sec + 8,279,629,193 cycles # 3.090 GHz + 17,543,064,215 instructions # 2.12 insn per cycle + 2.736536703 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.547888e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.551556e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.551556e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 8.983518 sec - 31,090,612,402 cycles:u # 3.452 GHz (74.95%) - 4,800,576 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) - 53,604,528 stalled-cycles-backend:u # 0.17% backend cycles idle (75.01%) - 24,535,934,860 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 9.033098812 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.231256e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.262658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.262658e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.980652 sec + 13,263,852,327 cycles # 3.083 GHz + 29,000,010,094 instructions # 2.19 insn per cycle + 4.358270161 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.025361e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.025389e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025389e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.150974 sec - 18,105,899,310 cycles:u # 3.500 GHz (74.95%) - 25,398,193 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.95%) - 2,124,566,094 stalled-cycles-backend:u # 11.73% backend cycles idle (74.95%) - 55,245,010,996 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 5.175179432 seconds time elapsed +OMP threads 
/ `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.470778e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.471016e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471016e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.238189 sec + 18,981,625,757 cycles # 3.042 GHz + 55,182,170,559 instructions # 2.91 insn per cycle + 6.243020322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.250260e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.250388e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.250388e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.348263 sec - 8,274,094,109 cycles:u # 3.490 GHz (75.03%) - 1,021,812 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 777,449,835 stalled-cycles-backend:u # 9.40% backend cycles idle (75.03%) - 27,051,629,562 instructions:u # 3.27 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 2.373845370 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.629833e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.629925e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.629925e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.243616 sec + 9,801,779,820 cycles # 3.019 GHz + 27,057,747,913 instructions # 2.76 insn per cycle + 3.248379444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.227010e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.227682e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.227682e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.011912 sec - 3,596,339,890 cycles:u # 3.478 GHz (74.76%) - 1,231,609 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.09%) - 273,275,118 stalled-cycles-backend:u # 7.60% backend cycles idle (75.25%) - 9,569,402,426 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (75.25%) - 1.037151346 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.634061e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.634495e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.634495e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.458634 sec + 4,233,960,705 cycles # 2.895 GHz + 9,565,082,926 instructions # 2.26 insn per cycle + 1.463398878 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.157153e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157728e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157728e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.276005 sec + 3,691,081,174 cycles # 2.884 GHz + 8,450,630,071 instructions # 2.29 insn per cycle + 1.280852479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758568e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.759161e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.759161e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.411378 sec + 2,685,131,174 cycles # 1.898 GHz + 4,248,751,291 instructions # 1.58 insn per cycle + 1.415941060 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 06ffd83976..3eccb964a9 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:31:06 +DATE: 2024-02-05_21:18:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.206191e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.211569e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.211681e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.291628 sec - 28,876,587,435 cycles:u # 3.107 GHz (74.94%) - 3,733,262 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 6,377,437 stalled-cycles-backend:u # 0.02% backend cycles idle (74.97%) - 22,892,391,852 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.338653724 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.070259e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.070667e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.070770e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.451051 sec + 8,371,838,735 cycles # 3.054 GHz + 17,036,301,719 instructions # 2.03 insn per cycle + 2.850514565 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.566324e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569681e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569722e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 8.965793 sec - 27,937,142,550 cycles:u # 3.109 GHz (74.98%) - 3,362,502 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,180,647 stalled-cycles-backend:u # 0.03% backend cycles idle (74.97%) - 22,159,141,344 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 9.012156381 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 9.279514e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.281799e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282039e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.976549 sec + 13,229,619,550 cycles # 3.075 GHz + 30,205,700,049 instructions # 2.28 insn per cycle + 4.358227522 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.247292e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.247538e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.247538e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.714198 sec - 18,040,255,390 cycles:u # 3.145 GHz (74.92%) - 24,591,420 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.96%) - 2,224,872,655 stalled-cycles-backend:u # 12.33% backend cycles idle (75.02%) - 55,158,939,146 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 5.739565254 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.497915e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.498146e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.498146e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.221965 sec + 19,147,431,688 cycles # 3.078 GHz + 55,160,387,913 instructions # 2.88 insn per cycle + 6.228667213 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.020985e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021102e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021102e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.615679 sec - 8,282,568,893 cycles:u # 3.138 GHz (74.73%) - 978,772 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.72%) - 774,826,118 stalled-cycles-backend:u # 9.35% backend cycles idle (74.99%) - 27,108,501,706 instructions:u # 3.27 insn per cycle - # 0.03 stalled cycles per insn (75.12%) - 2.642868514 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.664813e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664906e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664906e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.176500 sec + 9,809,166,525 cycles # 3.085 GHz + 27,064,727,613 instructions # 2.76 insn per cycle + 3.187515497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.624971e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.625557e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.625557e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.143999 sec - 3,648,039,246 cycles:u # 3.125 GHz (74.74%) - 2,783,303 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.65%) - 339,048,447 stalled-cycles-backend:u # 9.29% backend cycles idle (74.71%) - 9,622,788,226 instructions:u # 2.64 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 1.171053302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.612667e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.613093e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.613093e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.470168 sec + 4,255,189,971 cycles # 2.893 GHz + 9,569,385,070 instructions # 2.25 insn per cycle + 1.482181474 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.143983e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.144622e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.144622e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.282751 sec + 3,732,361,171 cycles # 2.906 GHz + 8,454,728,771 instructions # 2.27 insn per cycle + 1.294623526 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.739856e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.740406e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.740406e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.420004 sec + 2,682,413,534 cycles # 1.887 GHz + 4,250,779,821 instructions # 1.58 insn per cycle + 1.433022366 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 6372071c48..fbcfae640e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:32:49 +DATE: 2024-02-05_21:19:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.834552e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.837886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.837919e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.468557 sec - 14,183,679,196 cycles:u # 3.160 GHz (74.85%) - 3,326,136 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 6,659,579 stalled-cycles-backend:u # 0.05% backend cycles idle (75.02%) - 11,566,659,019 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 4.518838173 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.757763e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.758621e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.758964e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.692068 sec + 5,923,762,775 cycles # 3.061 GHz + 11,779,158,242 instructions # 1.99 insn per cycle + 2.045271179 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.376728e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.395099e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.395255e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.702932 sec - 14,041,255,129 cycles:u # 2.995 GHz (74.91%) - 2,934,354 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 7,163,543 stalled-cycles-backend:u # 0.05% backend cycles idle (75.05%) - 11,452,634,634 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 4.750425234 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.306138e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.306904e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.307028e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.920494 sec + 6,750,155,929 cycles # 3.075 GHz + 14,446,579,365 instructions # 2.14 insn per cycle + 2.254837097 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.738728e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.738996e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.738996e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.423944 sec - 16,810,814,055 cycles:u # 3.086 GHz (74.91%) - 12,801,190 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.03%) - 1,744,694,220 stalled-cycles-backend:u # 10.38% backend cycles idle (75.03%) - 51,792,221,693 instructions:u # 3.08 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 5.449591756 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.308137e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.308421e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.308421e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.679908 sec + 17,563,953,137 cycles # 3.092 GHz + 51,786,319,770 instructions # 2.95 insn per cycle + 5.686611091 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.049652e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.050113e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.050113e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.305602 sec - 4,074,876,790 cycles:u # 3.066 GHz (74.80%) - 785,206 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.73%) - 421,906,256 stalled-cycles-backend:u # 10.35% backend cycles idle (74.58%) - 13,824,045,075 instructions:u # 3.39 insn per cycle - # 0.03 stalled cycles per insn (74.88%) - 1.332713075 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.625271e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625713e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625713e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.465861 sec + 4,536,409,404 cycles # 3.093 GHz + 13,759,557,462 instructions # 3.03 insn per cycle + 1.477471378 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.139850e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.142248e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.142248e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.579538 sec - 1,836,301,682 cycles:u # 3.045 GHz (75.01%) - 952,261 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.81%) - 158,183,106 stalled-cycles-backend:u # 8.61% backend cycles idle (74.81%) - 4,853,489,425 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.81%) - 0.606755150 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.224940e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.226824e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.226824e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.739518 sec + 2,138,196,753 cycles # 2.882 GHz + 4,827,470,131 instructions # 2.26 insn per cycle + 0.750354876 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.523869e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.525949e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.525949e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.709067 sec + 1,880,974,593 cycles # 2.645 GHz + 4,259,949,962 instructions # 2.26 insn per cycle + 0.721507274 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.449771e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.452110e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.452110e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.718076 sec + 1,359,361,418 cycles # 1.894 GHz + 2,148,710,095 instructions # 1.58 insn per cycle + 0.730309288 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index a554157bf3..bc5d9230e1 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_19:13:53 +DATE: 2024-02-05_22:00:29 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.853224e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.853633e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.853633e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 -TOTAL : 4.453332 sec - 15,223,637,685 cycles:u # 3.400 GHz (75.00%) - 2,771,746 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 7,367,332 stalled-cycles-backend:u # 0.05% backend cycles idle (75.01%) - 12,410,544,885 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 4.501474464 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.808944e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.810771e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.810771e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.596030 sec + 5,707,463,906 cycles # 3.053 GHz + 12,247,898,086 instructions # 2.15 insn per cycle + 1.926904704 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.369153e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.384951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.384951e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 -TOTAL : 4.655580 sec - 15,991,009,132 cycles:u # 3.419 GHz (74.87%) - 3,807,736 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) - 47,032,900 stalled-cycles-backend:u # 0.29% backend cycles idle (75.03%) - 12,953,793,598 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 4.700563262 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.304775e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317135e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317135e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.879824 sec + 6,584,846,738 cycles # 3.060 GHz + 14,435,491,289 instructions # 2.19 insn per cycle + 2.211594164 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.107264e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107296e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107296e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.770157 sec - 16,770,929,930 cycles:u # 3.500 GHz (74.96%) - 12,660,822 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%) - 1,893,653,846 stalled-cycles-backend:u # 11.29% backend cycles idle (74.96%) - 51,806,136,580 instructions:u # 3.09 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 4.794448400 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.283170e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.283458e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.283458e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.693374 sec + 17,594,854,342 cycles # 3.089 GHz + 51,786,439,660 instructions # 2.94 insn per cycle + 5.698505906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.589146e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.589670e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.589670e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.152176 sec - 4,092,173,704 cycles:u # 3.486 GHz (74.86%) - 897,315 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%) - 383,423,485 stalled-cycles-backend:u # 9.37% backend cycles idle (74.83%) - 13,837,928,802 instructions:u # 3.38 insn per cycle - # 0.03 stalled cycles per insn (74.68%) - 1.177600284 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.591153e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.591586e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.591586e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.476060 sec + 4,558,398,705 cycles # 3.082 GHz + 13,759,164,758 instructions # 3.02 insn per cycle + 1.481085306 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.033165e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033435e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.033435e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.512821 sec - 1,850,580,257 cycles:u # 3.460 GHz (74.68%) - 801,661 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.58%) - 171,065,874 stalled-cycles-backend:u # 9.24% backend cycles idle (74.58%) - 4,888,673,602 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.21%) - 0.538169231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.012684e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.014407e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.014407e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.758517 sec + 2,146,491,468 cycles # 2.816 GHz + 4,826,881,543 instructions # 2.25 insn per cycle + 0.763270552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.114888e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.117040e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.117040e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.655572 sec + 1,897,429,617 cycles # 2.879 GHz + 4,259,285,185 instructions # 2.24 insn per cycle + 0.660139204 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.527218e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.529426e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.529426e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.707913 sec + 1,351,788,067 cycles # 1.900 GHz + 2,148,014,763 instructions # 1.59 insn per cycle + 0.712542671 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index cf278eddf4..b68e70f1e6 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:33:57 +DATE: 2024-02-05_21:19:55 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.868765e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.872239e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.872288e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.402395 sec - 13,271,493,404 cycles:u # 3.000 GHz (74.88%) - 2,886,684 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.90%) - 6,550,270 stalled-cycles-backend:u # 0.05% backend cycles idle (75.00%) - 10,885,613,237 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 4.450270074 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.769637e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.770485e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.770746e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.691744 sec + 5,877,714,864 cycles # 3.043 GHz + 12,600,922,025 instructions # 2.14 insn per cycle + 2.042117654 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.376055e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.391546e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.391643e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.669395 sec - 14,084,174,259 cycles:u # 3.005 GHz (74.91%) - 2,801,474 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 6,533,267 stalled-cycles-backend:u # 0.05% backend cycles idle (75.09%) - 11,524,020,381 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (75.10%) - 4.721310793 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.329038e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.329805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.329898e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.930264 sec + 6,710,757,777 cycles # 3.034 GHz + 14,507,110,241 instructions # 2.16 insn per cycle + 2.271397744 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.903825e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.904092e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.904092e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.336670 sec - 16,857,784,212 cycles:u # 3.145 GHz (74.93%) - 18,781,556 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.94%) - 1,691,660,205 stalled-cycles-backend:u # 10.03% backend cycles idle (74.96%) - 51,778,005,120 instructions:u # 3.07 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 5.362534137 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.294726e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.295036e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.295036e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.691679 sec + 17,566,579,323 cycles # 3.086 GHz + 51,758,460,980 instructions # 2.95 insn per cycle + 5.698578649 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087396841E-003 -Relative difference = 2.119623377106246e-08 +Avg ME (F77/C++) = 9.8479612087313262E-003 +Relative difference = 2.1195385077844924e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.115498e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.115986e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.115986e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.284920 sec - 4,067,296,429 cycles:u # 3.110 GHz (75.00%) - 802,854 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) - 403,620,597 stalled-cycles-backend:u # 9.92% backend cycles idle (74.93%) - 13,801,881,820 instructions:u # 3.39 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 1.311460175 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.616176e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.616676e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.616676e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.469152 sec + 4,545,584,305 cycles # 3.093 GHz + 13,757,988,516 instructions # 3.03 insn per cycle + 1.485351175 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.225770e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.228365e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.228365e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.573839 sec - 1,834,916,505 cycles:u # 3.073 GHz (74.55%) - 663,425 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.55%) - 164,551,872 stalled-cycles-backend:u # 8.97% backend cycles idle (74.55%) - 4,875,367,626 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (74.79%) - 0.600651055 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.204235e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.206094e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.206094e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.739543 sec + 2,146,737,695 cycles # 2.895 GHz + 4,826,497,019 instructions # 2.25 insn per cycle + 0.752280009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.297842e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.300329e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.300329e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.643256 sec + 1,855,486,706 cycles # 2.874 GHz + 4,258,716,994 instructions # 2.30 insn per cycle + 0.655646533 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.510694e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.512912e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.512912e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.712983 sec + 1,353,886,497 cycles # 1.900 GHz + 2,147,755,049 instructions # 1.59 insn per cycle + 0.726727549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index fb9c2f0290..16d218cf03 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:35:03 +DATE: 2024-02-05_21:20:42 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.648038e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.652945e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.653039e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.695567 sec - 30,205,650,330 cycles:u # 3.109 GHz (74.93%) - 4,340,496 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 6,895,406 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) - 23,899,524,019 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.745066259 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.694986e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.695598e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.695826e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.201129 sec + 7,429,679,479 cycles # 2.975 GHz + 16,463,728,646 instructions # 2.22 insn per cycle + 2.609456882 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.332582e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.335476e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.335510e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.304279 sec - 28,962,713,553 cycles:u # 3.106 GHz (75.01%) - 3,467,787 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 7,678,561 stalled-cycles-backend:u # 0.03% backend cycles idle (74.95%) - 23,027,565,140 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 9.351434608 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 1.108333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108647e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108689e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.408187 sec + 11,495,638,263 cycles # 3.080 GHz + 26,728,549,756 instructions # 2.33 insn per cycle + 3.788711649 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 9.154821e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.155049e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.155049e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.759679 sec - 18,254,580,621 cycles:u # 3.157 GHz (74.96%) - 35,463,525 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.96%) - 2,215,087,047 stalled-cycles-backend:u # 12.13% backend cycles idle (74.96%) - 55,428,774,623 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 5.784252327 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.471059e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.471318e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471318e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.240779 sec + 19,232,906,655 cycles # 3.082 GHz + 55,389,629,933 instructions # 2.88 insn per cycle + 6.248170189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.107602e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107739e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107739e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.508053 sec - 7,947,357,636 cycles:u # 3.140 GHz (74.91%) - 1,968,487 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) - 830,230,111 stalled-cycles-backend:u # 10.45% backend cycles idle (75.03%) - 25,895,965,513 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 2.534723739 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.635227e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.635341e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635341e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.239255 sec + 9,369,871,652 cycles # 2.893 GHz + 25,875,855,274 instructions # 2.76 insn per cycle + 3.251872137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.972373e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.973056e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.973056e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.063681 sec - 3,417,815,869 cycles:u # 3.146 GHz (75.03%) - 1,146,039 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 286,110,911 stalled-cycles-backend:u # 8.37% backend cycles idle (74.97%) - 9,138,730,343 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 1.090363703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.809685e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.810215e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810215e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.395288 sec + 4,039,725,857 cycles # 2.895 GHz + 9,120,228,738 instructions # 2.26 insn per cycle + 1.407748801 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.410989e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.411632e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.411632e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.205151 sec + 3,518,398,005 cycles # 2.916 GHz + 8,030,302,177 instructions # 2.28 insn per cycle + 1.218344654 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.913569e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.914240e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.914240e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.360336 sec + 2,599,600,994 cycles # 1.911 GHz + 4,076,381,200 instructions # 1.57 insn per cycle + 1.381113740 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 22fe4f03b0..d6475b63f8 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-08_18:36:49 +DATE: 2024-02-05_21:21:43 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.616335e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.621581e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.621650e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.699058 sec - 29,598,806,729 cycles:u # 3.045 GHz (74.95%) - 3,576,423 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 7,747,683 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 23,405,537,654 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.749031776 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.694409e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.695049e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.695333e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.172258 sec + 7,378,666,393 cycles # 2.961 GHz + 16,226,225,965 instructions # 2.20 insn per cycle + 2.560679655 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.344020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347210e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347244e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.276482 sec - 28,308,931,845 cycles:u # 3.045 GHz (74.90%) - 3,414,978 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%) - 7,329,285 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 22,421,621,422 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.323740295 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 1.109841e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110156e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110184e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.405673 sec + 11,489,330,658 cycles # 3.080 GHz + 26,309,022,266 instructions # 2.29 insn per cycle + 3.786697195 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 8.988779e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.989048e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.989048e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.875618 sec - 18,246,421,245 cycles:u # 3.093 GHz (74.92%) - 26,708,145 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.93%) - 2,188,827,775 stalled-cycles-backend:u # 12.00% backend cycles idle (75.00%) - 55,417,603,916 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 5.901493860 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.484297e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484561e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484561e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.228499 sec + 19,192,891,143 cycles # 3.080 GHz + 55,417,637,865 instructions # 2.89 insn per cycle + 6.232958003 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.109158e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.109282e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.109282e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.506347 sec - 7,931,256,027 cycles:u # 3.136 GHz (74.96%) - 1,981,078 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 850,969,438 stalled-cycles-backend:u # 10.73% backend cycles idle (75.02%) - 25,831,043,266 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 2.533258870 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.643545e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.643648e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643648e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.218830 sec + 9,334,199,176 cycles # 2.897 GHz + 25,822,511,752 instructions # 2.77 insn per cycle + 3.230562767 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.969298e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.969935e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.969935e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.060730 sec - 3,390,200,214 cycles:u # 3.131 GHz (74.89%) - 683,003 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) - 278,832,981 stalled-cycles-backend:u # 8.22% backend cycles idle (74.89%) - 9,099,432,389 instructions:u # 2.68 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 1.086316685 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.834505e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.834980e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.834980e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.382141 sec + 4,015,593,467 cycles # 2.897 GHz + 9,098,984,041 instructions # 2.27 insn per cycle + 1.393724809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.422794e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.423431e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.423431e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.198884 sec + 3,486,499,273 cycles # 2.899 GHz + 8,010,159,141 instructions # 2.30 insn per cycle + 1.211575244 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.921332e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.921975e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.921975e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.351774 sec + 2,598,841,557 cycles # 1.917 GHz + 4,065,366,264 instructions # 1.56 insn per cycle + 1.362524517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 0b3887f883..3f81b13e98 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:27:49 +DATE: 2024-02-05_21:15:14 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.669570e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.293826e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.681058e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.446507 sec + 1,970,176,956 cycles # 2.985 GHz + 2,781,026,447 instructions # 1.41 insn per cycle + 0.734838981 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 678,458,595 cycles:u # 1.950 GHz (76.80%) - 2,096,593 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.56%) - 4,777,999 stalled-cycles-backend:u # 0.70% backend cycles idle (74.58%) - 1,192,050,273 instructions:u # 1.76 insn per cycle - # 0.00 stalled cycles per insn (75.86%) - 0.419347421 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 914,884,219 cycles:u # 1.986 GHz (73.94%) - 2,206,448 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.90%) - 4,965,762 stalled-cycles-backend:u # 0.54% backend cycles idle (75.65%) - 1,456,857,476 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (72.57%) - 0.486671081 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.263975e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.148170e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.555265e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.529208 sec + 2,301,609,838 cycles # 3.000 GHz + 3,254,587,662 instructions # 1.41 insn per cycle + 0.824893939 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x1543b4ed9000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x15464a241dbf in ??? -#1 0x15464a241d2b in ??? -#2 0x15464a2433e4 in ??? -#3 0x154642714b64 in ??? -#4 0x154642711b38 in ??? -#5 0x1546426cf496 in ??? -#6 0x15464a1db6e9 in ??? -#7 0x15464a30f49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.049511e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.067308e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.067308e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.587472 sec - 4,990,333,752 cycles:u # 3.099 GHz (74.56%) - 2,680,458 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.80%) - 666,047,617 stalled-cycles-backend:u # 13.35% backend cycles idle (75.17%) - 13,821,217,940 instructions:u # 2.77 insn per cycle - # 0.05 stalled cycles per insn (75.17%) - 
1.612934376 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.056266e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.077334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.077334e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.573129 sec + 4,877,263,280 cycles # 3.093 GHz + 13,800,372,792 instructions # 2.83 insn per cycle + 1.580008121 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x1482ae5e9000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.047638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.125957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.125957e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.821495 sec + 2,560,095,543 cycles # 3.099 GHz + 7,400,936,297 instructions # 2.89 insn per cycle + 0.836848289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.404910e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.625111e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625111e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.502830 sec + 1,476,855,789 cycles # 2.911 GHz + 3,136,939,664 instructions # 2.12 insn per cycle + 0.513981856 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.875594e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.165680e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.165680e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444549 sec + 1,308,091,426 cycles # 2.913 GHz + 2,923,486,765 instructions # 2.23 insn per cycle + 0.459054653 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.700738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.840729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.840729e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.630385 sec + 1,264,655,111 cycles # 1.992 GHz + 1,899,913,388 instructions # 1.50 insn per cycle + 0.642819266 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 0df8842357..55c8eeafda 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,119 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_19:10:40 +DATE: 2024-02-05_21:57:45 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.677163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.149342e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.149342e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.469301 sec + 2,062,711,084 cycles # 3.004 GHz + 3,076,481,136 instructions # 1.49 insn per cycle + 0.744248998 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 728,182,078 cycles:u # 2.092 GHz (75.39%) - 2,800,235 stalled-cycles-frontend:u # 0.38% frontend cycles idle (76.02%) - 39,838,991 stalled-cycles-backend:u # 5.47% backend cycles idle (74.83%) - 1,290,090,050 instructions:u # 1.77 insn per cycle - # 0.03 stalled cycles per insn (73.58%) - 0.375238581 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 2,975,558,509 cycles:u # 2.720 GHz (75.17%) - 30,279,059 stalled-cycles-frontend:u # 1.02% frontend cycles idle (75.00%) - 868,671,193 stalled-cycles-backend:u # 29.19% backend cycles idle (73.95%) - 3,189,244,807 instructions:u # 1.07 insn per cycle - # 0.27 stalled cycles per insn (74.42%) - 1.123357973 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.258171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.292526e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292526e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.749639 sec + 3,027,636,992 cycles # 3.008 GHz + 4,599,039,489 instructions # 1.52 insn per cycle + 1.064296399 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x15053b5e9000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x1507d095cdbf in ??? -#1 0x1507d095cd2b in ??? -#2 0x1507d095e3e4 in ??? -#3 0x1507c8e2fb64 in ??? -#4 0x1507c8e2cb38 in ??? -#5 0x1507c8dea496 in ??? -#6 0x1507d08f66e9 in ??? -#7 0x1507d0a2a49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.178600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198668e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198668e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.417813 sec - 4,995,418,214 cycles:u # 3.468 GHz (75.01%) - 2,349,825 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 664,541,068 stalled-cycles-backend:u # 13.30% backend cycles idle (75.01%) - 13,818,530,399 instructions:u # 2.77 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 1.442715547 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.051692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.072651e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.072651e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.584692 sec + 4,904,139,806 cycles # 3.087 GHz + 13,805,381,065 instructions # 2.82 insn per cycle + 1.589628039 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x147118409000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.020153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.098786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.098786e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.839714 sec + 2,595,460,882 cycles # 3.078 GHz + 7,449,928,495 instructions # 2.87 insn per cycle + 0.844799137 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.334523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.553039e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.553039e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.521012 sec + 1,512,179,612 cycles # 2.880 GHz + 3,186,720,547 instructions # 2.11 insn per cycle + 0.525991705 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.523854e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.791701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791701e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.495383 sec + 1,347,648,083 cycles # 2.721 GHz + 2,975,110,368 instructions # 2.21 insn per cycle + 0.500624015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.655028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.787748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.787748e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.647387 sec + 1,294,117,885 cycles # 1.986 GHz + 1,936,924,916 instructions # 1.50 insn per cycle + 0.652437474 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 097c028fef..a69eb870b8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:28:05 +DATE: 2024-02-05_21:15:32 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.651751e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.189507e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.538308e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.445375 sec + 2,003,165,320 cycles # 3.004 GHz + 2,827,025,873 instructions # 1.41 insn per cycle + 0.738979058 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 698,865,952 cycles:u # 1.944 GHz (73.43%) - 2,244,464 stalled-cycles-frontend:u # 0.32% frontend cycles idle (72.12%) - 4,596,251 stalled-cycles-backend:u # 0.66% backend cycles idle (74.71%) - 1,218,556,671 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (76.73%) - 0.390528278 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 888,738,105 cycles:u # 1.956 GHz (74.02%) - 2,149,629 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.72%) - 4,210,700 stalled-cycles-backend:u # 0.47% backend cycles idle (75.23%) - 1,445,144,664 instructions:u # 1.63 insn per cycle - # 0.00 stalled cycles per insn (73.05%) - 0.480331343 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.236829e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.009410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.414016e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.528660 sec + 2,299,723,777 cycles # 3.006 GHz + 3,273,869,892 instructions # 1.42 insn per cycle + 0.825223447 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939e70) on address 0x149b09bf9000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x149d9ef6bdbf in ??? -#1 0x149d9ef6bd2b in ??? -#2 0x149d9ef6d3e4 in ??? -#3 0x149d9743eb64 in ??? -#4 0x149d9743bb38 in ??? -#5 0x149d973f9496 in ??? -#6 0x149d9ef056e9 in ??? -#7 0x149d9f03949e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.053182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071027e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.581312 sec - 4,982,835,902 cycles:u # 3.107 GHz (74.93%) - 2,582,687 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.06%) - 861,677,452 stalled-cycles-backend:u # 17.29% backend cycles idle (75.06%) - 13,811,263,039 instructions:u # 2.77 insn per cycle - # 0.06 stalled cycles per insn (75.07%) - 
1.606148924 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.957243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.016444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.016444e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.668980 sec + 4,884,718,098 cycles # 2.919 GHz + 13,808,181,391 instructions # 2.83 insn per cycle + 1.676281445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63e260) on address 0x14a85cf29000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.036799e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.115591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.115591e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.825393 sec + 2,561,451,557 cycles # 3.087 GHz + 7,406,342,161 instructions # 2.89 insn per cycle + 0.839309204 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.385477e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.609605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609605e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.505702 sec + 1,479,487,536 cycles # 2.900 GHz + 3,137,175,164 instructions # 2.12 insn per cycle + 0.519573265 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.870393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157477e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157477e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444867 sec + 1,304,235,883 cycles # 2.904 GHz + 2,924,972,743 instructions # 2.24 insn per cycle + 0.460522710 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.714778e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.852940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.852940e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.626914 sec + 1,263,639,141 cycles # 2.001 GHz + 1,899,641,042 instructions # 1.50 insn per cycle + 0.637674529 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index a0256eaec7..a421dad089 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:28:20 +DATE: 2024-02-05_21:15:50 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.317512e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217111e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.357151e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.441355 sec + 1,952,652,349 cycles # 2.978 GHz + 2,757,723,921 instructions # 1.41 insn per cycle + 0.728980619 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 730,948,845 cycles:u # 2.022 GHz (70.93%) - 2,207,273 stalled-cycles-frontend:u # 0.30% frontend cycles idle (73.75%) - 4,601,417 stalled-cycles-backend:u # 0.63% backend cycles idle (76.40%) - 1,234,849,282 instructions:u # 1.69 insn per cycle - # 0.00 stalled cycles per insn (75.58%) - 0.392194629 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 776,560,467 cycles:u # 1.875 GHz (75.08%) - 2,136,178 stalled-cycles-frontend:u # 0.28% frontend cycles idle (76.59%) - 4,919,267 stalled-cycles-backend:u # 0.63% backend cycles idle (74.22%) - 1,354,534,215 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (71.20%) - 0.442976193 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.287167e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812013e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961599e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476575 sec + 2,106,169,764 cycles # 3.003 GHz + 2,971,298,740 instructions # 1.41 insn per cycle + 0.760618216 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937f40) on address 0x1469473b4000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x146bdc71cdbf in ??? -#1 0x146bdc71cd2b in ??? -#2 0x146bdc71e3e4 in ??? -#3 0x146bd4befb64 in ??? -#4 0x146bd4becb38 in ??? -#5 0x146bd4baa496 in ??? -#6 0x146bdc6b66e9 in ??? -#7 0x146bdc7ea49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.289085e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.315926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.315926e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.293913 sec - 4,128,663,439 cycles:u # 3.138 GHz (74.91%) - 2,492,915 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.08%) - 255,992,571 stalled-cycles-backend:u # 6.20% backend cycles idle (75.08%) - 12,617,506,411 instructions:u # 3.06 insn per cycle - # 0.02 stalled cycles per insn (75.08%) - 1.318022009 
seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.187334e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215295e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.400396 sec + 4,340,193,687 cycles # 3.091 GHz + 12,596,376,231 instructions # 2.90 insn per cycle + 1.407492521 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x151cb5004000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.308372e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.534483e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534483e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.515722 sec + 1,591,767,628 cycles # 3.061 GHz + 4,246,687,782 instructions # 2.67 insn per cycle + 0.530129556 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.065695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.836938e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.836938e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.290244 sec + 849,117,535 cycles # 2.882 GHz + 1,915,632,467 instructions # 2.26 insn per cycle + 0.311212153 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.473801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.375742e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.375742e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.273964 sec + 780,140,985 cycles # 2.799 GHz + 1,797,931,367 instructions # 2.30 insn per cycle + 0.284454132 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.679551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.143053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.143053e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.373022 sec + 717,324,753 cycles # 1.900 GHz + 1,287,933,216 instructions # 1.80 insn per cycle + 0.387579974 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 44356807a0..6c6a59a9fb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,119 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_19:10:55 +DATE: 2024-02-05_21:58:03 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.777207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.054581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.054581e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.453058 sec + 1,974,608,872 cycles # 2.986 GHz + 2,903,059,814 instructions # 1.47 insn per cycle + 0.721035655 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 726,549,981 cycles:u # 2.114 GHz (74.42%) - 2,764,701 stalled-cycles-frontend:u # 0.38% frontend cycles idle (75.23%) - 39,282,855 stalled-cycles-backend:u # 5.41% backend cycles idle (75.39%) - 1,223,427,295 instructions:u # 1.68 insn per cycle - # 0.03 stalled cycles per insn (76.73%) - 0.370654317 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,939,481,812 cycles:u # 2.907 GHz (73.26%) - 30,604,267 stalled-cycles-frontend:u # 1.04% frontend cycles idle (74.74%) - 845,721,220 stalled-cycles-backend:u # 28.77% backend cycles idle (76.20%) - 3,069,069,064 instructions:u # 1.04 insn per cycle - # 0.28 stalled cycles per insn (75.74%) - 1.037907236 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.250884e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.608433e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.608433e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.615768 sec + 2,539,106,336 cycles # 2.997 GHz + 3,850,180,903 instructions # 1.52 insn per cycle + 0.905916528 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937f40) on address 0x1548559fc000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x154aead64dbf in ??? -#1 0x154aead64d2b in ??? -#2 0x154aead663e4 in ??? -#3 0x154ae3237b64 in ??? -#4 0x154ae3234b38 in ??? -#5 0x154ae31f2496 in ??? -#6 0x154aeacfe6e9 in ??? -#7 0x154aeae3249e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.402501e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.431590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.431590e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.191704 sec - 4,212,711,584 cycles:u # 3.470 GHz (74.96%) - 2,486,175 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) - 287,047,288 stalled-cycles-backend:u # 6.81% backend cycles idle (74.96%) - 12,623,625,232 instructions:u # 3.00 insn per cycle - # 0.02 stalled cycles per insn (74.97%) - 1.216006959 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.186508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.214056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.403761 sec + 4,352,321,749 cycles # 3.092 GHz + 12,600,670,105 instructions # 2.90 insn per cycle + 1.408621157 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x14f8c1b54000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.308333e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.536707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.536707e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.519668 sec + 1,610,612,694 cycles # 3.076 GHz + 4,293,733,776 instructions # 2.67 insn per cycle + 0.524564296 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.070636e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.830253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.830253e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.293851 sec + 866,435,010 cycles # 2.909 GHz + 1,951,871,013 instructions # 2.25 insn per cycle + 0.298758260 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.668627e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.614860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.614860e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.269788 sec + 795,276,981 cycles # 2.905 GHz + 1,833,827,446 instructions # 2.31 insn per cycle + 0.274696767 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.025648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.546457e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.546457e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.352529 sec + 736,964,720 cycles # 2.068 GHz + 1,328,948,572 instructions # 1.80 insn per cycle + 0.357272217 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 3dfd6ebf2e..de231e55ec 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:28:35 +DATE: 2024-02-05_21:16:07 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.412670e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199054e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349763e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.439949 sec + 1,957,075,445 cycles # 2.994 GHz + 2,770,692,821 instructions # 1.42 insn per cycle + 0.720018081 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 688,551,808 cycles:u # 1.933 GHz (74.89%) - 2,213,305 stalled-cycles-frontend:u # 0.32% frontend cycles idle (75.58%) - 4,609,268 stalled-cycles-backend:u # 0.67% backend cycles idle (77.47%) - 1,197,231,249 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (76.19%) - 0.387263140 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 780,842,440 cycles:u # 1.930 GHz (74.65%) - 2,040,560 stalled-cycles-frontend:u # 0.26% frontend cycles idle (76.81%) - 4,988,745 stalled-cycles-backend:u # 0.64% backend cycles idle (75.02%) - 1,352,770,220 instructions:u # 1.73 insn per cycle - # 0.00 stalled cycles per insn (72.27%) - 0.432239511 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.167952e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.776862e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920498e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.473091 sec + 2,130,521,881 cycles # 3.019 GHz + 3,019,247,264 instructions # 1.42 insn per cycle + 0.763574930 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937e90) on address 0x14ae22ff4000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14b0b835ddbf in ??? -#1 0x14b0b835dd2b in ??? -#2 0x14b0b835f3e4 in ??? -#3 0x14b0b0830b64 in ??? -#4 0x14b0b082db38 in ??? -#5 0x14b0b07eb496 in ??? -#6 0x14b0b82f76e9 in ??? -#7 0x14b0b842b49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.286257e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313322e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.296760 sec - 4,140,961,015 cycles:u # 3.138 GHz (74.86%) - 2,651,490 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.12%) - 520,725,746 stalled-cycles-backend:u # 12.57% backend cycles idle (75.15%) - 12,613,422,158 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 
1.321948469 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.188596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.216506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.216506e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.398310 sec + 4,338,466,293 cycles # 3.094 GHz + 12,587,646,115 instructions # 2.90 insn per cycle + 1.405152993 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x61be40) on address 0x154d7a3a4000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.336768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.569890e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.569890e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.511225 sec + 1,585,938,113 cycles # 3.076 GHz + 4,241,172,905 instructions # 2.67 insn per cycle + 0.526002477 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.172477e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.952911e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.952911e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.285227 sec + 847,077,737 cycles # 2.926 GHz + 1,913,660,776 instructions # 2.26 insn per cycle + 0.300443120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.733284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.677333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.677333e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.263291 sec + 776,406,702 cycles # 2.902 GHz + 1,795,697,825 instructions # 2.31 insn per cycle + 0.278345913 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.156147e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704757e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704757e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.339548 sec + 716,982,371 cycles # 2.084 GHz + 1,286,640,400 instructions # 1.79 insn per cycle + 0.353133541 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 6f991a88e2..caba5422fa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:28:49 +DATE: 2024-02-05_21:16:24 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.665805e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.279968e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.648438e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.442706 sec + 2,029,552,154 cycles # 3.029 GHz + 2,846,442,861 instructions # 1.40 insn per cycle + 0.743425796 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 678,448,663 cycles:u # 1.984 GHz (75.54%) - 2,155,466 stalled-cycles-frontend:u # 0.32% frontend cycles idle (74.24%) - 5,437,136 stalled-cycles-backend:u # 0.80% backend cycles idle (72.01%) - 1,241,016,939 instructions:u # 1.83 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 0.390711844 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 922,080,460 cycles:u # 1.981 GHz (71.78%) - 2,206,869 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.43%) - 5,156,776 stalled-cycles-backend:u # 0.56% backend cycles idle (76.67%) - 1,354,584,330 instructions:u # 1.47 insn per cycle - # 0.00 stalled cycles per insn (76.01%) - 0.492294582 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.261949e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.123956e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.541939e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.529095 sec + 2,306,980,644 cycles # 3.012 GHz + 3,280,944,770 instructions # 1.42 insn per cycle + 0.823708444 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939f20) on address 0x14c931589000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14cbc68eddbf in ??? -#1 0x14cbc68edd2b in ??? -#2 0x14cbc68ef3e4 in ??? -#3 0x14cbbedc0b64 in ??? -#4 0x14cbbedbdb38 in ??? -#5 0x14cbbed7b496 in ??? -#6 0x14cbc68876e9 in ??? -#7 0x14cbc69bb49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.063338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081332e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.566665 sec - 5,004,100,594 cycles:u # 3.148 GHz (74.84%) - 1,445,039 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.84%) - 839,191,252 stalled-cycles-backend:u # 16.77% backend cycles idle (74.85%) - 13,856,679,231 instructions:u # 2.77 insn per cycle - # 0.06 stalled cycles per insn (74.92%) - 1.592188452 
seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.060897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.081794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081794e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.566339 sec + 4,891,432,798 cycles # 3.116 GHz + 13,823,965,188 instructions # 2.83 insn per cycle + 1.573214922 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63f5c0) on address 0x146cb9099000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.033305e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.110362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.110362e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.827032 sec + 2,591,385,710 cycles # 3.116 GHz + 7,349,073,111 instructions # 2.84 insn per cycle + 0.841790047 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.469754e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.701127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.701127e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.493958 sec + 1,464,063,754 cycles # 2.938 GHz + 3,084,407,899 instructions # 2.11 insn per cycle + 0.506428666 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.961658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.263124e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.263124e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.435846 sec + 1,279,115,887 cycles # 2.905 GHz + 2,873,181,084 instructions # 2.25 insn per cycle + 0.445725915 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.608300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.652237 sec + 1,302,193,887 cycles # 1.983 GHz + 1,914,659,883 instructions # 1.47 insn per cycle + 0.667291833 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 987ac60c0e..dea31763dd 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-08_18:29:05 +DATE: 2024-02-05_21:16:43 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634850e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.135834e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502178e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.445383 sec + 2,000,203,957 cycles # 3.007 GHz + 2,795,360,403 instructions # 1.40 insn per cycle + 0.738846057 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 726,128,967 cycles:u # 2.120 GHz (73.37%) - 2,161,283 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.05%) - 4,554,612 stalled-cycles-backend:u # 0.63% backend cycles idle (77.69%) - 1,200,673,004 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (77.69%) - 0.369447796 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 901,851,871 cycles:u # 1.964 GHz (73.55%) - 2,143,838 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.11%) - 5,320,407 stalled-cycles-backend:u # 0.59% backend cycles idle (75.63%) - 1,381,245,349 instructions:u # 1.53 insn per cycle - # 0.00 stalled cycles per insn (75.73%) - 0.484540471 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.227172e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.969457e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.380914e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.531264 sec + 2,302,864,776 cycles # 2.996 GHz + 3,298,090,495 instructions # 1.43 insn per cycle + 0.827884323 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939e70) on address 0x1502ead09000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x150580071dbf in ??? -#1 0x150580071d2b in ??? -#2 0x1505800733e4 in ??? -#3 0x150578544b64 in ??? -#4 0x150578541b38 in ??? -#5 0x1505784ff496 in ??? -#6 0x15058000b6e9 in ??? -#7 0x15058013f49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.065407e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.083513e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.563492 sec - 5,002,207,106 cycles:u # 3.151 GHz (74.80%) - 1,731,248 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.80%) - 789,645,029 stalled-cycles-backend:u # 15.79% backend cycles idle (74.66%) - 13,861,853,774 instructions:u # 2.77 insn per cycle - # 0.06 stalled cycles per insn (74.92%) - 1.589695330 
seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.052323e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.073479e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.073479e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.578545 sec + 4,898,302,542 cycles # 3.095 GHz + 13,831,146,011 instructions # 2.82 insn per cycle + 1.585728014 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63e260) on address 0x1546044f9000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.996784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.073146e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.073146e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.842613 sec + 2,603,005,292 cycles # 3.074 GHz + 7,352,584,625 instructions # 2.82 insn per cycle + 0.855614959 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.412508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.635285e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.635285e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.501624 sec + 1,466,613,920 cycles # 2.898 GHz + 3,084,946,401 instructions # 2.10 insn per cycle + 0.513214103 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.954805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.258575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.258575e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.435439 sec + 1,276,676,932 cycles # 2.902 GHz + 2,874,881,412 instructions # 2.25 insn per cycle + 0.450491255 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.598476e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.727689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.727689e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.653930 sec + 1,303,406,976 cycles # 1.980 GHz + 1,915,098,748 instructions # 1.47 insn per cycle + 0.670717403 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED From cc421be6a83c968524b0393196f574cbfa248a53 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 1 Mar 2024 08:18:48 +0100 Subject: [PATCH 83/96] [susy2] rerun 78 tput tests on itscrd90, all ok STARTED AT Thu Feb 29 11:07:30 PM CET 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Fri Mar 1 02:43:24 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Fri Mar 1 03:09:38 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Fri Mar 1 03:19:22 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Fri Mar 1 03:22:39 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Fri Mar 1 03:25:53 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ENDED(6) AT Fri Mar 1 03:29:12 AM CET 2024 [Status=0] No errors found in logs --- .../log_eemumu_mad_d_inl0_hrd0.txt | 190 +++++++------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 190 +++++++------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 190 +++++++------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 190 +++++++------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 190 +++++++------- 
.../log_eemumu_mad_d_inl0_hrd1.txt | 202 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 190 +++++++------- .../log_eemumu_mad_d_inl1_hrd1.txt | 202 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 186 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 186 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 186 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 186 +++++++------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 186 +++++++------- .../log_eemumu_mad_f_inl0_hrd1.txt | 186 +++++++------- .../log_eemumu_mad_f_inl1_hrd0.txt | 198 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 194 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 178 +++++++------- .../log_eemumu_mad_m_inl0_hrd1.txt | 178 +++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 182 +++++++------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 182 +++++++------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 182 +++++++------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 182 +++++++------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 182 +++++++------- .../log_ggtt_mad_d_inl0_hrd1.txt | 178 +++++++------- .../log_ggtt_mad_d_inl1_hrd0.txt | 186 +++++++------- .../log_ggtt_mad_d_inl1_hrd1.txt | 178 +++++++------- .../log_ggtt_mad_f_inl0_hrd0.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl0_hrd1.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 202 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 202 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 190 +++++++------- .../log_ggtt_mad_m_inl0_hrd1.txt | 190 +++++++------- .../log_ggttg_mad_d_inl0_hrd0.txt | 200 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 200 +++++++-------- .../log_ggttg_mad_d_inl0_hrd1.txt | 200 +++++++-------- 
.../log_ggttg_mad_f_inl0_hrd0.txt | 220 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 220 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 220 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 200 +++++++-------- .../log_ggttg_mad_m_inl0_hrd1.txt | 200 +++++++-------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 208 ++++++++-------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 212 ++++++++-------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 212 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 224 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 224 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 232 +++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 224 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 224 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 222 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 228 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 228 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 208 ++++++++-------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 208 ++++++++-------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 196 +++++++-------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 196 +++++++-------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 196 +++++++-------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 226 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 226 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 226 ++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 196 +++++++-------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 196 +++++++-------- .../log_gqttq_mad_d_inl0_hrd0.txt | 204 +++++++-------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 204 
+++++++-------- .../log_gqttq_mad_d_inl0_hrd1.txt | 204 +++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 218 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 218 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd1.txt | 218 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 196 +++++++-------- .../log_gqttq_mad_m_inl0_hrd1.txt | 196 +++++++-------- 78 files changed, 7863 insertions(+), 7863 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index bfc258d00c..baa8c044cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:03:08 +DATE: 2024-03-01_02:23:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.452350e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.294511e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.148428e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.822610 sec - 2,837,805,632 cycles # 3.005 GHz - 4,403,230,931 instructions # 1.55 insn per cycle - 1.161334861 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.907657 sec + 2,864,594,511 cycles # 3.017 GHz + 4,419,491,827 instructions # 1.54 insn per cycle + 1.243823060 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.064600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.236923e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.236923e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.301517 sec - 19,469,535,699 cycles # 3.088 GHz - 46,932,585,474 instructions # 2.41 insn per cycle - 6.315848009 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.029383 sec + 18,345,746,310 cycles # 3.041 GHz + 43,971,705,846 instructions # 2.40 insn per cycle + 6.038464488 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.670155e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.187482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.187482e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.157075 sec - 12,815,071,753 cycles # 3.079 GHz - 31,183,530,054 instructions # 2.43 insn per cycle - 4.175981498 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.154865 sec + 12,823,382,487 cycles # 3.082 GHz + 30,998,172,347 instructions # 2.42 insn per cycle + 4.171623433 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.043386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.866643e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.866643e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.480200 sec - 10,038,614,023 cycles # 2.880 GHz - 19,479,866,151 instructions # 1.94 insn per cycle - 3.497192381 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.406763 sec + 10,081,289,557 cycles # 2.955 GHz + 19,366,111,959 instructions # 1.92 insn per cycle + 3.427414790 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.206412e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.180520e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.180520e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.242937 sec - 9,601,951,480 cycles # 2.956 GHz - 18,942,365,265 instructions # 1.97 insn per cycle - 3.260567136 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +TOTAL : 3.257696 sec + 9,685,682,355 cycles # 2.968 GHz + 18,976,171,527 instructions # 1.96 insn per cycle + 3.273948471 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.987065e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.720633e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.720633e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.565006 sec - 8,159,998,179 cycles # 2.285 GHz - 15,511,778,574 instructions # 1.90 insn per cycle - 3.584683969 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +TOTAL : 3.888242 sec + 8,621,851,062 cycles # 2.214 GHz + 15,727,334,662 instructions # 1.82 insn per cycle + 3.905958468 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 65910bb431..b9ff72dbf3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:54:50 +DATE: 2024-03-01_03:12:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.680837e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566605e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566605e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.218628 sec - 7,529,157,154 cycles # 3.046 GHz - 13,238,923,769 instructions # 1.76 insn per cycle - 2.530894761 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +TOTAL : 2.232505 sec + 7,524,955,995 cycles # 3.041 GHz + 13,468,669,108 instructions # 1.79 insn per cycle + 2.532807464 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,151 +72,151 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.019513e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.177027e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.177027e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.764382 sec - 20,695,393,450 cycles # 3.057 GHz - 47,159,570,161 instructions # 2.28 insn per cycle - 6.772090237 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.415532 sec + 19,561,606,037 cycles # 3.046 GHz + 44,198,639,919 instructions # 2.26 insn per cycle + 6.422457347 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.579917e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.036956e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036956e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.580559 sec - 14,078,384,423 cycles # 3.069 GHz - 32,025,612,143 instructions # 2.27 insn per cycle - 4.588192092 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.664054 sec + 13,997,557,946 cycles # 2.998 GHz + 31,841,279,233 instructions # 2.27 insn per cycle + 4.670791737 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.968228e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.695210e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.695210e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.809898 sec - 11,283,308,654 cycles # 2.957 GHz - 20,842,408,471 instructions # 1.85 insn per cycle - 3.817347229 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.823801 sec + 11,324,833,068 cycles # 2.957 GHz + 20,724,775,427 instructions # 1.83 insn per cycle + 3.830534322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.843910e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.843910e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.698169 sec - 10,822,932,969 cycles # 2.922 GHz - 20,302,447,935 instructions # 1.88 insn per cycle - 3.705544033 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +TOTAL : 3.704930 sec + 10,963,593,820 cycles # 2.954 GHz + 20,347,072,159 instructions # 1.86 insn per cycle + 3.711957869 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.801708e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.413060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.413060e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.119290 sec - 9,498,000,250 cycles # 2.302 GHz - 16,663,815,127 instructions # 1.75 insn per cycle - 4.126857918 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +TOTAL : 4.214412 sec + 9,956,996,891 cycles # 2.360 GHz + 16,873,658,319 instructions # 1.69 insn per cycle + 4.221168968 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 4ae3af74cc..09aaad1dd8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:08:05 +DATE: 2024-03-01_03:26:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480206e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.597414e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.138890e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.306385 sec - 4,669,803,273 cycles # 3.040 GHz - 7,258,024,375 instructions # 1.55 insn per cycle - 1.593330168 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +TOTAL : 1.329039 sec + 4,626,136,964 cycles # 2.966 GHz + 7,229,705,832 instructions # 1.56 insn per cycle + 1.616136536 seconds time 
elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.064905e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.237097e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.237097e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.649438 sec - 20,601,461,815 cycles # 3.096 GHz - 47,036,177,622 instructions # 2.28 insn per cycle - 6.655735612 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.368910 sec + 19,436,039,687 cycles # 3.050 GHz + 44,075,637,403 instructions # 2.27 insn per cycle + 6.374367735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.642258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.154032e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.154032e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.581869 sec - 13,918,566,464 cycles # 3.040 GHz - 31,189,830,309 instructions # 2.24 insn per cycle - 4.588050359 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.477126 sec + 13,840,650,655 cycles # 3.088 GHz + 31,000,398,658 instructions # 2.24 insn per cycle + 4.482579907 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.094130e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.941483e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.941483e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.749910 sec - 11,151,948,172 cycles # 2.970 GHz - 19,381,078,189 instructions # 1.74 insn per cycle - 3.755953575 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.779571 sec + 11,221,356,305 cycles # 2.967 GHz + 19,268,573,834 instructions # 1.72 insn per cycle + 3.784933241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.184182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.148835e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.148835e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.634201 sec - 10,721,613,182 cycles # 2.946 GHz - 18,644,581,380 instructions # 1.74 insn per cycle - 3.640332735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +TOTAL : 3.643336 sec + 10,818,026,445 cycles # 2.966 GHz + 18,676,470,141 instructions # 1.73 insn per cycle + 3.648853496 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.014889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.782541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782541e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.873293 sec - 9,338,000,522 cycles # 2.408 GHz - 15,212,575,344 instructions # 1.63 insn per cycle - 3.879430539 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +TOTAL : 4.111357 sec + 9,725,602,646 cycles # 2.364 GHz + 15,429,502,829 instructions # 1.59 insn per cycle + 4.116843302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 04f97e0270..c5fdf6f3c6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:04:49 +DATE: 2024-03-01_03:22:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.500482e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.605097e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.137972e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.511929e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.606834e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.132028e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.958819 sec - 3,612,761,264 cycles # 3.040 GHz - 7,190,638,907 instructions # 1.99 insn per cycle - 1.246173317 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +TOTAL : 0.970663 sec + 3,681,129,197 cycles # 3.043 GHz + 7,185,953,404 instructions # 1.95 insn per cycle + 1.266725293 seconds time 
elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.049359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218824e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.129015e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325606e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.388054 sec - 19,467,473,893 cycles # 3.046 GHz - 46,934,600,434 instructions # 2.41 insn per cycle - 6.394290908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.971935 sec + 18,327,370,852 cycles # 3.067 GHz + 43,971,442,751 instructions # 2.40 insn per cycle + 5.977352348 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.659117e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.166540e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.166540e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.658250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.168305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168305e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.181185 sec - 12,809,069,138 cycles # 3.060 GHz - 31,183,695,070 instructions # 2.43 insn per cycle - 4.187406583 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.196892 sec + 12,732,971,160 cycles # 3.031 GHz + 30,998,026,084 instructions # 2.43 insn per cycle + 4.202372987 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.119833e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.977991e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.977991e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.058430e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883101e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.357662 sec - 9,984,165,417 cycles # 2.969 GHz - 19,478,887,428 instructions # 1.95 insn per cycle - 3.363778767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.446922 sec + 10,145,804,321 cycles # 2.940 GHz + 19,366,948,979 instructions # 1.91 insn per cycle + 3.452452971 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.227801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.199728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.199728e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.138596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.023243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023243e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.214378 sec - 9,537,291,787 cycles # 2.963 GHz - 18,941,744,436 instructions # 1.99 insn per cycle - 3.220442066 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +TOTAL : 3.339653 sec + 9,693,126,342 cycles # 2.898 GHz + 18,976,550,822 instructions # 1.96 insn per cycle + 3.345442131 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.008509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.773490e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.773490e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.879529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506982e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.524415 sec - 8,196,437,380 cycles # 2.322 GHz - 15,510,956,697 instructions # 1.89 insn per cycle - 3.530544205 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +TOTAL : 3.741561 sec + 8,595,853,951 cycles # 2.295 GHz + 15,727,211,339 instructions # 1.83 insn per cycle + 3.747065146 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index f143b0d07e..4a4acadae4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,199 +13,199 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:01:32 +DATE: 2024-03-01_03:19:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.185742e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.554312e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.047778e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.839986 sec - 6,325,475,989 cycles # 3.058 GHz - 11,573,800,558 instructions # 1.83 insn per cycle - 2.127438997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +TOTAL : 1.841184 sec + 6,281,268,865 cycles # 3.032 GHz + 
11,616,541,551 instructions # 1.85 insn per cycle + 2.127335919 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.062149e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.235118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.235118e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.314728 sec - 19,513,325,065 cycles # 3.088 GHz - 46,932,457,987 instructions # 2.41 insn per cycle - 6.321008534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.931254 sec + 18,320,874,631 cycles # 3.087 GHz + 43,971,483,251 instructions # 2.40 insn per cycle + 5.936943481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.678056e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.200158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.200158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.138227 sec - 12,805,005,540 cycles # 3.091 GHz - 31,182,633,145 instructions # 2.44 insn per cycle - 4.144327243 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.142725 sec + 12,747,370,194 cycles # 3.074 GHz + 30,997,666,885 instructions # 2.43 insn per cycle + 4.148307465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.946386e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.946386e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.380556 sec - 10,029,667,841 cycles # 2.964 GHz - 19,479,253,229 instructions # 1.94 insn per cycle - 3.387379331 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.411600 sec + 10,085,079,136 cycles # 2.953 GHz + 19,364,558,625 instructions # 1.92 insn per cycle + 3.417084709 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.148197e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.084864e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.084864e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.329151 sec - 9,564,427,384 cycles # 2.869 GHz - 18,942,155,298 instructions # 1.98 insn per cycle - 3.335198952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +TOTAL : 3.338836 sec + 9,731,023,917 cycles # 2.911 GHz + 18,988,816,377 instructions # 1.95 insn per cycle + 3.344328310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.991549e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.749556e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.749556e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.557586 sec - 8,179,849,242 cycles # 2.297 GHz - 15,511,241,799 instructions # 1.90 insn per cycle - 3.563627106 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +TOTAL : 3.766791 sec + 8,586,243,314 cycles # 2.277 GHz + 15,726,194,960 instructions # 1.83 insn per cycle + 3.772300478 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 6a3f1ceed7..acaec4a100 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:03:44 +DATE: 2024-03-01_02:24:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.454088e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.310484e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.178306e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.688204 sec - 2,788,161,286 cycles # 3.018 GHz - 4,313,930,533 instructions # 1.55 insn per cycle - 1.004609512 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.699180 sec + 2,815,032,547 cycles # 3.020 GHz + 4,411,732,319 instructions # 1.57 insn per cycle + 1.012826906 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165206E-002 -Relative difference = 1.027708011645137e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132282e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329017e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.949222 sec - 18,388,946,227 cycles # 3.089 GHz - 44,715,744,739 instructions # 2.43 insn per cycle - 5.964000663 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.744811 sec + 17,454,360,700 cycles # 3.039 GHz + 41,822,159,126 instructions # 2.40 insn per cycle + 5.754685240 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868164921E-002 -Relative difference = 1.0277102294013186e-08 +Avg ME (F77/C++) = 1.2828039868164916E-002 +Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.733678e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.296638e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296638e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.018712 sec - 12,430,727,525 cycles # 3.089 GHz - 30,107,925,252 instructions # 2.42 insn per cycle - 4.041024093 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.046627 sec + 12,493,235,601 cycles # 3.083 GHz + 30,160,547,265 instructions # 2.41 insn per cycle + 4.067076512 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868164921E-002 -Relative difference = 1.0277102294013186e-08 +Avg ME (F77/C++) = 1.2828039868164916E-002 +Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.056780e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.883541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.883541e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.453635 sec - 10,178,670,600 cycles # 2.943 GHz - 19,115,449,328 instructions # 1.88 insn per cycle - 3.483429962 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) +TOTAL : 3.357760 sec + 9,927,136,910 cycles # 2.952 GHz + 19,096,793,241 instructions # 1.92 insn per cycle + 3.375474470 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.258262e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271969e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271969e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.174579 sec - 9,422,090,303 cycles # 2.963 GHz - 18,488,534,290 instructions # 1.96 insn per cycle - 3.194716512 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) +TOTAL : 3.243150 sec + 9,616,213,299 cycles # 2.960 GHz + 18,757,748,925 instructions # 1.95 insn per cycle + 3.265371118 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.415665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.995764 sec - 7,229,078,869 cycles # 2.409 GHz - 13,863,533,911 instructions # 1.92 insn per cycle - 3.021066220 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) +TOTAL : 3.680994 sec + 8,464,459,891 cycles # 2.296 GHz + 15,603,182,673 instructions # 1.84 insn per cycle + 3.700542167 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index ace759a2cd..5e36a6ad1c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:43:48 +DATE: 2024-03-01_03:02:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.475865e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.604505e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.146221e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.701054 sec - 2,730,306,219 cycles # 3.021 GHz - 4,260,051,145 instructions # 1.56 insn per cycle - 0.996046541 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.677531 sec + 2,738,360,567 cycles # 3.010 GHz + 4,202,554,319 instructions # 1.53 insn per cycle + 0.971727419 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039868165201E-002 Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.458258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.800445e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.800445e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.703376 sec - 14,587,622,722 cycles # 3.099 GHz - 36,695,900,493 instructions # 2.52 insn per cycle - 4.709688335 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.107132 sec + 12,669,493,888 cycles # 3.081 GHz + 32,513,570,576 instructions # 2.57 insn per cycle + 4.112837024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.110786e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.028102e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.028102e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.372756 sec - 10,352,627,181 cycles # 3.065 GHz - 24,753,157,000 instructions # 2.39 insn per cycle - 3.378997727 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.385880 sec + 10,259,128,837 cycles # 3.025 GHz + 24,473,597,991 instructions # 2.39 insn per cycle + 3.391687112 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.404635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.589571e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.589571e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.006552 sec - 8,879,177,800 cycles # 2.948 GHz - 16,954,648,410 instructions # 1.91 insn per cycle - 3.013416619 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) +TOTAL : 3.179158 sec + 9,139,183,085 cycles # 2.870 GHz + 16,922,980,195 instructions # 1.85 insn per cycle + 3.185130704 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.558616e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983086e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.859667 sec - 8,322,312,903 cycles # 2.905 GHz - 16,297,913,029 instructions # 1.96 insn per cycle - 2.866233363 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) +TOTAL : 3.299126 sec + 9,225,486,663 cycles # 2.804 GHz + 16,350,529,622 instructions # 1.77 insn per cycle + 3.305119215 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.182801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.123196e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.123196e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.275855 sec - 7,741,160,272 cycles # 2.360 GHz - 14,352,612,145 instructions # 1.85 insn per cycle - 3.282280773 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) +TOTAL : 3.449960 sec + 7,914,148,444 cycles # 2.292 GHz + 14,582,993,732 instructions # 1.84 insn per cycle + 3.455623027 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 462a523e87..640cde8efe 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:44:19 +DATE: 2024-03-01_03:02:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.483285e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624302e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.203253e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.661912 sec - 2,759,005,929 cycles # 3.039 GHz - 4,269,523,592 instructions # 1.55 insn per cycle - 0.969846459 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.676373 sec + 2,668,503,996 cycles # 2.929 GHz + 4,153,523,497 instructions # 1.56 insn per cycle + 0.971892133 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165206E-002 -Relative difference = 1.027708011645137e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050399e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.792596e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.792596e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.454738 sec - 10,760,210,593 cycles # 3.110 GHz - 28,354,229,298 instructions # 2.64 insn per cycle - 3.460997731 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.188433 sec + 9,833,021,244 cycles # 3.080 GHz + 25,393,539,961 instructions # 2.58 insn per cycle + 3.194101979 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868164921E-002 -Relative difference = 1.0277102294013186e-08 +Avg ME (F77/C++) = 1.2828039868164916E-002 +Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.420212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.670206e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.670206e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.989459 sec - 9,269,380,262 cycles # 3.097 GHz - 21,587,653,666 instructions # 2.33 insn per cycle - 2.995960242 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.899703 sec + 8,920,893,128 cycles # 3.072 GHz + 21,482,466,118 instructions # 2.41 insn per cycle + 2.905533602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868164921E-002 -Relative difference = 1.0277102294013186e-08 +Avg ME (F77/C++) = 1.2828039868164916E-002 +Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611672e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.040459e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.040459e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.798403 sec - 8,372,237,978 cycles # 2.987 GHz - 15,943,054,462 instructions # 1.90 insn per cycle - 2.804906138 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +TOTAL : 2.882396 sec + 8,595,793,495 cycles # 2.978 GHz + 15,810,706,009 instructions # 1.84 insn per cycle + 2.888136564 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.809188e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.542047e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.542047e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.625748 sec - 7,852,939,253 cycles # 2.985 GHz - 15,369,604,507 instructions # 1.96 insn per cycle - 2.631901573 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) +TOTAL : 2.905551 sec + 8,435,887,633 cycles # 2.898 GHz + 15,503,428,881 instructions # 1.84 insn per cycle + 2.911395780 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.255448e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.294259e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.294259e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.188310 sec - 7,384,984,169 cycles # 2.313 GHz - 13,880,431,877 instructions # 1.88 insn per cycle - 3.194595387 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) +TOTAL : 3.208349 sec + 7,562,205,797 cycles # 2.353 GHz + 14,282,233,625 instructions # 1.89 insn per cycle + 3.214128577 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 8a5dca8407..4388b968c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:04:17 +DATE: 2024-03-01_02:25:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.090292e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087271e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.295454e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080730e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.579975 sec - 2,414,557,687 cycles # 3.000 GHz - 3,747,446,339 instructions # 1.55 insn per cycle - 0.879481305 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.584592 sec + 2,424,873,450 cycles # 2.992 GHz + 3,757,113,510 instructions # 1.55 insn per cycle + 0.891497126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.093315e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.289133e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.289133e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.114541 sec - 18,579,851,378 cycles # 3.036 GHz - 47,045,898,593 instructions # 2.53 insn per cycle - 6.128370923 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.861200 sec + 17,835,681,737 cycles # 3.040 GHz + 43,512,863,183 instructions # 2.44 insn per cycle + 5.870178360 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.229543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.408352e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.408352e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.196404 sec - 9,243,413,602 cycles # 2.887 GHz - 22,093,191,316 instructions # 2.39 insn per cycle - 3.216612535 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.010180 sec + 9,264,818,102 cycles # 3.072 GHz + 21,907,230,972 instructions # 2.36 insn per cycle + 3.030108679 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.646884e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.120078e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.120078e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.730676 sec - 8,172,441,197 cycles # 2.987 GHz - 15,624,936,554 instructions # 1.91 insn per cycle - 2.751599284 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.786671 sec + 8,293,439,755 cycles # 2.970 GHz + 15,591,050,714 instructions # 1.88 insn per cycle + 2.803351674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.682429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.266485e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.266485e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.705183 sec - 7,880,109,299 cycles # 2.907 GHz - 15,297,032,323 instructions # 1.94 insn per cycle - 2.721809360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +TOTAL : 2.857922 sec + 8,240,284,445 cycles # 2.878 GHz + 15,434,807,288 instructions # 1.87 insn per cycle + 2.873134335 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.780173e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.417878e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.417878e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.614704 sec - 6,408,894,473 cycles # 2.446 GHz - 12,623,358,022 instructions # 1.97 insn per cycle - 2.635161616 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +TOTAL : 2.738177 sec + 6,634,758,903 cycles # 2.418 GHz + 12,863,535,626 instructions # 1.94 insn per cycle + 2.752418443 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 63369f8db6..5ebf98d844 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:55:27 +DATE: 2024-03-01_03:13:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.329739e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.607347e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.607347e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.648968 sec - 5,746,677,815 cycles # 3.062 GHz - 10,387,478,908 instructions # 1.81 insn per cycle - 1.933666455 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +TOTAL : 1.664885 sec + 5,743,008,286 cycles # 3.032 GHz + 10,353,112,228 instructions # 1.80 insn per cycle + 1.950710268 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,151 +72,151 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.099961e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.290947e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.290947e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.174779 sec - 19,223,896,861 cycles # 3.111 GHz - 47,194,228,256 instructions # 2.45 insn per cycle - 6.182172402 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.094512 sec + 18,492,834,117 cycles # 3.035 GHz + 43,665,828,462 instructions # 2.36 insn per cycle + 6.100764200 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285304e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.422206e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.422206e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.222898 sec - 10,014,499,753 cycles # 3.102 GHz - 23,429,401,993 instructions # 2.34 insn per cycle - 3.229900393 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.242674 sec + 9,984,073,322 cycles # 3.074 GHz + 23,241,211,318 instructions # 2.33 insn per cycle + 3.248988906 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.519465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.836911e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.836911e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.979965 sec - 8,932,946,167 cycles # 2.992 GHz - 16,752,151,880 instructions # 1.88 insn per cycle - 2.986773953 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 3.031931 sec + 9,018,287,343 cycles # 2.969 GHz + 16,710,480,351 instructions # 1.85 insn per cycle + 3.038355322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.602197e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.895405 sec - 8,631,584,781 cycles # 2.975 GHz - 16,422,465,858 instructions # 1.90 insn per cycle - 2.902263166 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +TOTAL : 3.003313 sec + 8,924,279,581 cycles # 2.966 GHz + 16,553,851,203 instructions # 1.85 insn per cycle + 3.009721457 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.626828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049108e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049108e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.871620 sec - 7,175,875,520 cycles # 2.494 GHz - 13,849,689,792 instructions # 1.93 insn per cycle - 2.878604820 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +TOTAL : 3.047824 sec + 7,411,564,908 cycles # 2.428 GHz + 14,070,800,087 instructions # 1.90 insn per cycle + 3.054259465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 7e45b462eb..57f3a9eb6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:08:41 +DATE: 2024-03-01_03:26:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.307848e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174799e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254654e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.148146 sec - 4,162,615,810 cycles # 3.046 GHz - 6,633,625,649 instructions # 1.59 insn per cycle - 1.425956052 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +TOTAL : 1.176348 sec + 4,160,459,328 cycles # 2.977 GHz + 6,608,736,714 instructions # 1.59 insn per cycle + 1.454481545 seconds time 
elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.110958e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.307650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.307650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.341153 sec - 19,568,239,445 cycles # 3.084 GHz - 47,229,460,572 instructions # 2.41 insn per cycle - 6.347007883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.084905 sec + 18,848,150,042 cycles # 3.095 GHz + 43,694,410,467 instructions # 2.32 insn per cycle + 6.090122961 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.331196e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570674e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570674e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.377654 sec - 10,245,146,540 cycles # 3.029 GHz - 22,173,719,356 instructions # 2.16 insn per cycle - 3.383537599 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.340145 sec + 10,237,006,523 cycles # 3.061 GHz + 21,987,992,116 instructions # 2.15 insn per cycle + 3.345494687 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.607110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.069423e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.069423e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.090252 sec - 9,206,173,868 cycles # 2.975 GHz - 15,535,610,413 instructions # 1.69 insn per cycle - 3.096199232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 3.130033 sec + 9,276,164,079 cycles # 2.959 GHz + 15,501,530,354 instructions # 1.67 insn per cycle + 3.135291294 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.726874e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.384844e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.384844e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.983648 sec - 8,939,068,101 cycles # 2.992 GHz - 15,006,420,771 instructions # 1.68 insn per cycle - 2.989393384 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +TOTAL : 3.090209 sec + 9,218,829,691 cycles # 2.980 GHz + 15,143,949,757 instructions # 1.64 insn per cycle + 3.095551418 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.761883e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.389409e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.389409e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.961475 sec - 7,435,706,316 cycles # 2.509 GHz - 12,333,144,638 instructions # 1.66 insn per cycle - 2.967308412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +TOTAL : 3.081111 sec + 7,633,670,846 cycles # 2.474 GHz + 12,572,894,419 instructions # 1.65 insn per cycle + 3.086406325 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 43d8f0743f..72f866059b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:05:23 +DATE: 2024-03-01_03:23:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.313608e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194062e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.294541e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.312185e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188856e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293387e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.837839 sec - 3,221,458,304 cycles # 3.025 GHz - 6,460,874,497 instructions # 2.01 insn per cycle - 1.122134673 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +TOTAL : 0.840959 sec + 3,233,651,545 cycles # 3.031 GHz + 6,593,293,750 instructions # 2.04 insn per cycle + 1.123835132 seconds time 
elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.114925e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312637e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.312637e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.165423e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.995075 sec - 18,550,724,282 cycles # 3.092 GHz - 47,045,610,283 instructions # 2.54 insn per cycle - 6.000876182 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.753852 sec + 17,814,734,742 cycles # 3.094 GHz + 43,512,567,450 instructions # 2.44 insn per cycle + 5.759197636 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328720e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.561231e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.561231e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.367425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644557e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.060634 sec - 9,255,866,935 cycles # 3.027 GHz - 22,095,165,932 instructions # 2.39 insn per cycle - 3.066453520 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.015821 sec + 9,302,641,553 cycles # 3.081 GHz + 21,907,397,717 instructions # 2.35 insn per cycle + 3.021054890 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.644282e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.130925e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.130925e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.605570e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.994881e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994881e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.731302 sec - 8,179,125,857 cycles # 2.990 GHz - 15,624,271,395 instructions # 1.91 insn per cycle - 2.737082415 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.763364 sec + 8,259,626,138 cycles # 2.984 GHz + 15,589,955,941 instructions # 1.89 insn per cycle + 2.768827600 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.755995e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.398064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.398064e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.581356e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971929e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.631869 sec - 7,862,262,943 cycles # 2.982 GHz - 15,296,823,766 instructions # 1.95 insn per cycle - 2.637847648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +TOTAL : 2.794808 sec + 8,189,932,997 cycles # 2.926 GHz + 15,434,468,382 instructions # 1.88 insn per cycle + 2.800117026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.790427e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.441377e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.441377e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.644746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.098711e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.098711e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.608833 sec - 6,396,610,736 cycles # 2.448 GHz - 12,622,932,331 instructions # 1.97 insn per cycle - 2.614856919 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +TOTAL : 2.731125 sec + 6,642,886,027 cycles # 2.429 GHz + 12,862,690,732 instructions # 1.94 insn per cycle + 2.736362886 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index eccc4446a6..8d8716bc9a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,199 +13,199 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_22:02:07 +DATE: 2024-03-01_03:20:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.242306e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.165230e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.195778e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.444358 sec - 5,084,597,164 cycles # 3.054 GHz - 9,275,960,840 instructions # 1.82 insn per cycle - 1.722136335 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +TOTAL : 1.452761 sec + 5,067,036,613 cycles # 3.030 GHz + 
9,262,361,364 instructions # 1.83 insn per cycle + 1.731002061 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115036e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313739e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.313739e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.993942 sec - 18,570,897,233 cycles # 3.097 GHz - 47,045,577,349 instructions # 2.53 insn per cycle - 5.999677453 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.780149 sec + 17,815,433,670 cycles # 3.080 GHz + 43,511,102,764 instructions # 2.44 insn per cycle + 5.785180938 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.400769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.663094e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.663094e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.965370 sec - 9,232,101,571 cycles # 3.108 GHz - 22,091,285,431 instructions # 2.39 insn per cycle - 2.971262386 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.992624 sec + 9,227,327,267 cycles # 3.079 GHz + 21,906,426,544 instructions # 2.37 insn per cycle + 2.997895192 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.574500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.992207e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.992207e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.803871 sec - 8,156,775,877 cycles # 2.904 GHz - 15,624,194,262 instructions # 1.92 insn per cycle - 2.809755883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.845512 sec + 8,254,984,848 cycles # 2.896 GHz + 15,590,498,904 instructions # 1.89 insn per cycle + 2.850900280 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.764006e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.416109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.416109e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.625956 sec - 7,852,454,396 cycles # 2.985 GHz - 15,295,794,447 instructions # 1.95 insn per cycle - 2.631853276 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +TOTAL : 2.764714 sec + 8,215,374,590 cycles # 2.969 GHz + 15,429,066,515 instructions # 1.88 insn per cycle + 2.770036927 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.751259e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.355715e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.355715e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.642188 sec - 6,396,687,295 cycles # 2.417 GHz - 12,623,285,928 instructions # 1.97 insn per cycle - 2.647901833 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +TOTAL : 2.731162 sec + 6,615,238,340 cycles # 2.419 GHz + 12,862,797,254 instructions # 1.94 insn per cycle + 2.736410000 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index e0c72c5e2b..f9e4000e6d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:04:47 +DATE: 2024-03-01_02:25:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.091038e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.089941e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319291e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.579294 sec - 2,335,364,905 cycles # 2.891 GHz - 3,618,466,633 instructions # 1.55 insn per cycle - 0.885152629 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.581297 sec + 2,416,875,461 cycles # 3.000 GHz + 3,802,904,431 instructions # 1.57 insn per cycle + 0.886522859 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.170365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.389464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.389464e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.728515 sec - 17,727,054,378 cycles # 3.093 GHz - 43,885,619,946 instructions # 2.48 insn per cycle - 5.742377896 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.444566 sec + 16,726,225,777 cycles # 3.070 GHz + 41,270,625,621 instructions # 2.47 insn per cycle + 5.454849598 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.388452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.722234e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.722234e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.990798 sec - 9,083,778,420 cycles # 3.032 GHz - 21,582,115,658 instructions # 2.38 insn per cycle - 3.009778662 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.914617 sec + 8,996,783,237 cycles # 3.081 GHz + 21,210,998,059 instructions # 2.36 insn per cycle + 2.929493898 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.653242e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.141586e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.141586e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.723711 sec - 8,113,273,564 cycles # 2.973 GHz - 15,429,717,708 instructions # 1.90 insn per cycle - 2.745601764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) +TOTAL : 2.760181 sec + 8,249,336,928 cycles # 2.983 GHz + 15,425,238,678 instructions # 1.87 insn per cycle + 2.778856529 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.767282e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.438840e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.438840e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.623612 sec - 7,842,213,849 cycles # 2.983 GHz - 15,086,650,215 instructions # 1.92 insn per cycle - 2.642651290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) +TOTAL : 2.789811 sec + 8,096,556,575 cycles # 2.897 GHz + 15,238,891,903 instructions # 1.88 insn per cycle + 2.804859872 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.853427e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.667539e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.667539e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.566015 sec - 6,180,775,263 cycles # 2.404 GHz - 12,245,115,281 instructions # 1.98 insn per cycle - 2.584349048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) +TOTAL : 2.735992 sec + 6,623,617,660 cycles # 2.417 GHz + 12,843,079,376 instructions # 1.94 insn per cycle + 2.752411310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 +Avg ME (F77/C++) = 1.2828052564145764E-002 +Relative difference = 1.9988585667912256e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 2ca4079866..fde060de72 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:44:47 +DATE: 2024-03-01_03:03:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.294311e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188610e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298765e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181869e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.558468 sec - 2,361,222,111 cycles # 3.020 GHz - 3,664,682,465 instructions # 1.55 insn per cycle - 0.839090549 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.576138 sec + 2,415,755,755 cycles # 3.001 GHz + 3,734,378,655 instructions # 1.55 insn per cycle + 0.864225849 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.515808e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.899432e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.899432e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.497522 sec - 13,733,531,110 cycles # 3.050 GHz - 37,847,942,431 instructions # 2.76 insn per cycle - 4.503720248 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.002640 sec + 12,159,409,273 cycles # 3.035 GHz + 32,432,694,101 instructions # 2.67 insn per cycle + 4.008158303 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039414671366E-002 -Relative difference = 4.562884388571957e-08 +Avg ME (F77/C++) = 1.2828039840314887E-002 +Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.870371e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.896075e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.896075e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.543054 sec - 7,908,771,565 cycles # 3.104 GHz - 18,602,738,348 instructions # 2.35 insn per cycle - 2.549080007 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.601867 sec + 7,999,882,010 cycles # 3.069 GHz + 18,656,600,340 instructions # 2.33 insn per cycle + 2.607493343 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039283704129E-002 +Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.917953e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.876983e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.876983e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.514474 sec - 7,422,980,759 cycles # 2.947 GHz - 14,339,966,909 instructions # 1.93 insn per cycle - 2.520873084 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) +TOTAL : 2.492780 sec + 7,427,313,914 cycles # 2.974 GHz + 14,251,086,474 instructions # 1.92 insn per cycle + 2.498394316 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053246266791E-002 -Relative difference = 2.5306003563303186e-07 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.019478e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.132034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.132034e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.432856 sec - 7,272,187,783 cycles # 2.983 GHz - 13,954,061,773 instructions # 1.92 insn per cycle - 2.439104287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) +TOTAL : 2.444620 sec + 7,299,238,549 cycles # 2.980 GHz + 13,947,633,533 instructions # 1.91 insn per cycle + 2.450212772 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.695630e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.321192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.321192e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.703955 sec - 6,276,486,435 cycles # 2.319 GHz - 13,210,471,023 instructions # 2.10 insn per cycle - 2.710431861 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) +TOTAL : 2.681955 sec + 6,492,318,128 cycles # 2.417 GHz + 13,422,094,611 instructions # 2.07 insn per cycle + 2.687432186 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052540498902E-002 -Relative difference = 1.980424851420537e-07 +Avg ME (F77/C++) = 1.2828052562326775E-002 +Relative difference = 1.997440588685788e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index b4c719017f..0d6d3b3db1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:45:14 +DATE: 2024-03-01_03:03:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298701e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198119e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328440e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204111e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.562549 sec - 2,357,901,564 cycles # 3.003 GHz - 3,618,401,654 instructions # 1.53 insn per cycle - 0.844509832 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.576922 sec + 2,404,705,116 cycles # 2.985 GHz + 3,758,296,111 instructions # 1.56 insn per cycle + 0.864210592 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 Avg ME (F77/CUDA) = 1.2828112125134794E-002 Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.119202e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.962243e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962243e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.318683 sec - 10,107,573,477 cycles # 3.041 GHz - 28,399,137,457 instructions # 2.81 insn per cycle - 3.324720314 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.097656 sec + 9,472,450,742 cycles # 3.053 GHz + 25,268,175,697 instructions # 2.67 insn per cycle + 3.103042436 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039838495897E-002 +Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.147008e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.786097e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.786097e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.346802 sec - 7,263,828,292 cycles # 3.089 GHz - 16,785,829,907 instructions # 2.31 insn per cycle - 2.352943440 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.415041 sec + 7,164,638,851 cycles # 2.961 GHz + 16,869,197,703 instructions # 2.35 insn per cycle + 2.420723497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.123540e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.405304e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.405304e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.369038 sec - 7,067,469,253 cycles # 2.977 GHz - 13,728,904,699 instructions # 1.94 insn per cycle - 2.375173059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) +TOTAL : 2.394138 sec + 7,165,321,711 cycles # 2.987 GHz + 13,616,190,038 instructions # 1.90 insn per cycle + 2.399577311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.164074e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.525165e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.525165e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.342246 sec - 7,005,059,344 cycles # 2.987 GHz - 13,461,252,232 instructions # 1.92 insn per cycle - 2.348361461 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) +TOTAL : 2.363661 sec + 7,031,964,685 cycles # 2.970 GHz + 13,425,613,371 instructions # 1.91 insn per cycle + 2.369281481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.031554e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.093489e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.093489e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.432548 sec - 6,052,738,178 cycles # 2.484 GHz - 12,911,325,567 instructions # 2.13 insn per cycle - 2.438724916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) +TOTAL : 2.592425 sec + 6,321,858,831 cycles # 2.434 GHz + 13,153,560,775 instructions # 2.08 insn per cycle + 2.597985755 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 +Avg ME (F77/C++) = 1.2828052536860923E-002 +Relative difference = 1.977588895209662e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index be425be4c7..4be3e76490 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:05:17 +DATE: 2024-03-01_02:26:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.442335e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.270525e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.142227e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.694348 sec - 2,778,759,265 cycles # 2.987 GHz - 4,351,769,973 instructions # 1.57 insn per cycle - 1.013538948 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.717219 sec + 2,841,227,385 cycles # 2.957 GHz + 4,430,504,412 instructions # 1.56 insn per cycle + 1.049815549 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039901590279E-002 Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053043e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.221133e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221133e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.363891 sec - 19,667,437,924 cycles # 3.088 GHz - 46,970,593,915 instructions # 2.39 insn per cycle - 6.379400508 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.069129 sec + 18,728,354,553 cycles # 3.083 GHz + 44,224,513,518 instructions # 2.36 insn per cycle + 6.079869673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.674094e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.206402e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.206402e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.154494 sec - 12,528,254,523 cycles # 3.011 GHz - 30,922,253,447 instructions # 2.47 insn per cycle - 4.173218933 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.001256 sec + 12,323,242,096 cycles # 3.075 GHz + 30,917,838,115 instructions # 2.51 insn per cycle + 4.017904894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.054042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.864807e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.864807e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.453659 sec - 10,215,405,512 cycles # 2.953 GHz - 19,546,791,186 instructions # 1.91 insn per cycle - 3.473149540 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) +TOTAL : 3.416443 sec + 10,120,877,504 cycles # 2.958 GHz + 19,374,733,180 instructions # 1.91 insn per cycle + 3.431641491 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.158084e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.088743e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.088743e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.309666 sec - 9,731,831,976 cycles # 2.936 GHz - 18,859,355,725 instructions # 1.94 insn per cycle - 3.328443471 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) +TOTAL : 3.374976 sec + 9,706,052,635 cycles # 2.871 GHz + 18,944,519,271 instructions # 1.95 insn per cycle + 3.395274500 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.025709e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.816813e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.816813e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.498711 sec - 8,174,773,168 cycles # 2.333 GHz - 14,813,296,851 instructions # 1.81 insn per cycle - 3.518454852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) +TOTAL : 3.760847 sec + 8,409,257,244 cycles # 2.233 GHz + 15,057,436,319 instructions # 1.79 insn per cycle + 3.776930410 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 016b4dcfb3..77001f8935 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-02-05_21:05:51 +DATE: 2024-03-01_02:26:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.442557e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276977e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.115971e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.691949 sec - 2,792,230,252 cycles # 3.012 GHz - 4,406,954,821 instructions # 1.58 insn per cycle - 1.006155048 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.699538 sec + 2,805,342,043 cycles # 2.999 GHz + 4,414,010,673 instructions # 1.57 insn per cycle + 1.020206687 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 Avg ME (F77/CUDA) = 1.2828039901590279E-002 Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.115199e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308123e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.308123e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.039453 sec - 18,520,409,160 cycles # 3.065 GHz - 44,592,637,919 instructions # 2.41 insn per cycle - 6.051596986 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.837265 sec + 18,090,198,997 cycles # 3.097 GHz + 42,472,863,850 instructions # 2.35 insn per cycle + 5.848007644 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.733167e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.307863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.307863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.027291 sec - 12,198,841,388 cycles # 3.025 GHz - 30,216,598,772 instructions # 2.48 insn per cycle - 4.046858847 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.920672 sec + 12,137,736,337 cycles # 3.092 GHz + 30,225,042,392 instructions # 2.49 insn per cycle + 3.938311189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049138e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.860594e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.860594e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.461039 sec - 10,275,869,327 cycles # 2.964 GHz - 19,037,482,995 instructions # 1.85 insn per cycle - 3.487028160 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) +TOTAL : 3.437770 sec + 10,015,371,277 cycles # 2.909 GHz + 19,256,811,213 instructions # 1.92 insn per cycle + 3.454377757 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.220471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.207666e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.207666e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.222554 sec - 9,576,129,417 cycles # 2.966 GHz - 18,451,864,640 instructions # 1.93 insn per cycle - 3.247543229 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) +TOTAL : 3.235635 sec + 9,645,810,411 cycles # 2.976 GHz + 18,756,051,671 instructions # 1.94 insn per cycle + 3.251774736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.403087e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.559501e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.559501e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.009542 sec - 7,207,687,295 cycles # 2.391 GHz - 13,242,362,983 instructions # 1.84 insn per cycle - 3.028852409 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) +TOTAL : 3.592139 sec + 8,293,535,644 cycles # 2.305 GHz + 14,979,176,568 instructions # 1.81 insn per cycle + 3.613399615 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 5df3cfc728..9a5df19d5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:06:24 +DATE: 2024-03-01_02:27:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.031380e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.141335e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278634e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.533981 sec - 2,292,597,871 cycles # 2.985 GHz - 3,192,077,403 instructions # 1.39 insn per cycle - 0.845567304 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.535145 sec + 2,303,454,226 cycles # 2.990 GHz + 3,249,200,622 instructions # 1.41 insn per cycle + 0.848848936 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.167153e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.231469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.231469e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.944054 sec - 14,988,837,417 cycles # 3.029 GHz - 38,722,101,845 instructions # 2.58 insn per cycle - 4.959702244 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.903669 sec + 15,175,795,116 cycles # 3.093 GHz + 38,374,949,840 instructions # 2.53 insn per cycle + 4.917105673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.673555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.879473e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.879473e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.960626 sec - 8,958,008,294 cycles # 3.020 GHz - 24,429,572,303 instructions # 2.73 insn per cycle - 2.978621858 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.968890 sec + 9,101,848,873 cycles # 3.060 GHz + 24,578,505,710 instructions # 2.70 insn per cycle + 2.986159008 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.882622e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.400822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.400822e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.887751 sec - 5,543,029,143 cycles # 2.927 GHz - 11,561,920,461 instructions # 2.09 insn per cycle - 1.907306147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.936093 sec + 5,474,671,571 cycles # 2.819 GHz + 11,252,385,098 instructions # 2.06 insn per cycle + 1.954008279 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.804787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.518217e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.518217e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.646661 sec - 4,830,901,508 cycles # 2.924 GHz - 10,339,492,038 instructions # 2.14 insn per cycle - 1.663574159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +TOTAL : 1.774092 sec + 4,972,729,611 cycles # 2.794 GHz + 10,557,445,760 instructions # 2.12 insn per cycle + 1.789622209 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.510361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.803489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.803489e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.431325 sec - 4,950,210,545 cycles # 2.032 GHz - 7,554,605,170 instructions # 1.53 insn per cycle - 2.448514603 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +TOTAL : 2.799185 sec + 5,395,066,029 cycles # 1.924 GHz + 7,793,871,634 instructions # 1.44 insn per cycle + 2.817161041 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 3b9c251b66..598396a8e7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:56:00 +DATE: 2024-03-01_03:14:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.584968e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.894866e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.894866e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.799738 sec - 3,165,879,197 cycles # 3.015 GHz - 4,878,208,543 instructions # 1.54 insn per cycle - 1.107477855 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +TOTAL : 0.801549 sec + 3,157,604,220 cycles # 3.025 GHz + 4,827,294,021 instructions # 1.53 insn per cycle + 1.101037847 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,148 +72,148 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.203709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.271501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.271501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.942943 sec - 15,313,610,911 cycles # 3.094 GHz - 38,782,458,170 instructions # 2.53 insn per cycle - 4.950650950 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.008942 sec + 15,497,351,856 cycles # 3.090 GHz + 38,433,512,801 instructions # 2.48 insn per cycle + 5.015755142 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.720233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.927980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.927980e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.000337 sec - 9,301,358,484 cycles # 3.094 GHz - 24,611,951,458 instructions # 2.65 insn per cycle - 3.007950946 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.090616 sec + 9,430,020,802 cycles # 3.049 GHz + 24,763,068,407 instructions # 2.63 insn per cycle + 3.097621879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.770866e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.269781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.269781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.001192 sec - 5,886,505,539 cycles # 2.932 GHz - 11,848,414,311 instructions # 2.01 insn per cycle - 2.008838193 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.984017 sec + 5,826,620,771 cycles # 2.928 GHz + 11,538,062,844 instructions # 1.98 insn per cycle + 1.990946794 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.354252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.965861e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.965861e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.834602 sec - 5,170,533,445 cycles # 2.808 GHz - 10,625,432,779 instructions # 2.05 insn per cycle - 1.842105594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +TOTAL : 1.799262 sec + 5,294,562,816 cycles # 2.933 GHz + 10,843,404,980 instructions # 2.05 insn per cycle + 1.806082483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.444731e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731238e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731238e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.545615 sec - 5,312,567,215 cycles # 2.082 GHz - 7,798,505,404 instructions # 1.47 insn per cycle - 2.553150085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +TOTAL : 2.778138 sec + 5,743,518,580 cycles # 2.063 GHz + 8,037,207,687 instructions # 1.40 insn per cycle + 2.785184310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index e26aca600d..977053e874 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:09:13 +DATE: 2024-03-01_03:27:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.584208e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160059e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278576e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.609246 sec - 2,581,101,746 cycles # 3.036 GHz - 3,713,117,006 instructions # 1.44 insn per cycle - 0.907852678 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +TOTAL : 0.617245 sec + 2,532,813,012 cycles # 2.999 GHz + 3,701,870,616 instructions # 1.46 insn per cycle + 0.904006340 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.227475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.294111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.294111e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.874182 sec - 15,158,379,002 cycles # 3.108 GHz - 38,738,551,190 instructions # 2.56 insn per cycle - 4.880819504 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.966854 sec + 15,343,121,883 cycles # 3.087 GHz + 38,390,661,623 instructions # 2.50 insn per cycle + 4.972403311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.767694e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.983118e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.983118e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.945749 sec - 9,142,701,773 cycles # 3.098 GHz - 24,427,627,687 instructions # 2.67 insn per cycle - 2.951884209 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.079495 sec + 9,279,730,828 cycles # 3.010 GHz + 24,577,932,954 instructions # 2.65 insn per cycle + 3.085060857 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.805965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.338850e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.338850e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.974104 sec - 5,727,022,204 cycles # 2.896 GHz - 11,544,923,636 instructions # 2.02 insn per cycle - 1.980353692 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.937503 sec + 5,654,473,993 cycles # 2.911 GHz + 11,233,989,199 instructions # 1.99 insn per cycle + 1.943141738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.756149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.446891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.446891e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.717912 sec - 5,004,707,628 cycles # 2.905 GHz - 10,287,500,091 instructions # 2.06 insn per cycle - 1.724204003 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +TOTAL : 1.757396 sec + 5,128,637,723 cycles # 2.910 GHz + 10,505,547,256 instructions # 2.05 insn per cycle + 1.762900213 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.529833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.832701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.832701e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.483819 sec - 5,134,216,348 cycles # 2.063 GHz - 7,502,488,127 instructions # 1.46 insn per cycle - 2.490239423 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +TOTAL : 2.739915 sec + 5,558,468,681 cycles # 2.025 GHz + 7,741,606,815 instructions # 1.39 insn per cycle + 2.745378653 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index b2a67c7d91..29a670398e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:05:53 +DATE: 2024-03-01_03:24:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.574516e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156692e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271930e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.579097e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155655e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270242e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.550400 sec - 2,362,670,466 cycles # 3.032 GHz - 3,700,412,120 instructions # 1.57 insn per cycle - 0.836987659 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +TOTAL : 0.554530 sec + 2,358,271,315 cycles # 3.013 GHz + 3,682,090,929 instructions # 1.56 insn per cycle + 0.840283729 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.199981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.265430e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.265430e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.177843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241689e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.873281 sec - 14,976,294,919 cycles # 3.071 GHz - 38,723,494,875 instructions # 2.59 insn per cycle - 4.880112383 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.919493 sec + 15,156,700,875 cycles # 3.078 GHz + 38,373,397,442 instructions # 2.53 insn per cycle + 4.925048190 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.720404e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.926714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.926714e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.588081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.785746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.785746e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.922753 sec - 8,955,117,292 cycles # 3.059 GHz - 24,429,239,776 instructions # 2.73 insn per cycle - 2.929117772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.028765 sec + 9,114,596,397 cycles # 3.011 GHz + 24,581,732,536 instructions # 2.70 insn per cycle + 3.034354491 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.848139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.358480e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.358480e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.938829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.476997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.476997e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.898353 sec - 5,532,392,582 cycles # 2.906 GHz - 11,561,358,692 instructions # 2.09 insn per cycle - 1.904642376 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.869884 sec + 5,467,539,853 cycles # 2.917 GHz + 11,251,237,475 instructions # 2.06 insn per cycle + 1.875504692 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.810691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.505130e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.505130e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.273575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.896545e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.896545e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.645115 sec - 4,822,133,768 cycles # 2.922 GHz - 10,338,425,342 instructions # 2.14 insn per cycle - 1.651426752 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +TOTAL : 1.780053 sec + 4,944,261,583 cycles # 2.770 GHz + 10,558,833,446 instructions # 2.14 insn per cycle + 1.785881884 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.293803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.567203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.567203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.090701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.328087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.328087e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.549026 sec - 4,939,812,418 cycles # 1.934 GHz - 7,554,058,853 instructions # 1.53 insn per cycle - 2.555210030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +TOTAL : 2.667720 sec + 5,371,754,599 cycles # 2.010 GHz + 7,792,372,952 instructions # 1.45 insn per cycle + 2.673339648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index fc219074a0..e5cfc13b3e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,196 +13,196 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:02:38 +DATE: 2024-03-01_03:20:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.974189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155593e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271531e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.695991 sec - 2,799,036,357 cycles # 3.024 GHz - 4,374,236,566 instructions # 1.56 insn per cycle - 0.983348330 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +TOTAL : 0.697938 sec + 2,798,675,219 cycles # 3.021 GHz + 4,376,672,842 instructions # 1.56 insn per cycle + 0.983897382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.222419e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.288906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.288906e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.824134 sec - 14,966,646,991 cycles # 3.100 GHz - 38,721,963,975 instructions # 2.59 insn per cycle - 4.830722708 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.893907 sec + 15,162,024,600 cycles # 3.096 GHz + 38,372,989,497 instructions # 2.53 insn per cycle + 4.899450957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779967e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.992813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.992813e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.878539 sec - 8,952,214,854 cycles # 3.104 GHz - 24,428,365,936 instructions # 2.73 insn per cycle - 2.884761942 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.935182 sec + 9,091,941,153 cycles # 3.094 GHz + 24,577,519,112 instructions # 2.70 insn per cycle + 2.940777194 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.891897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.405679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.405679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.885534 sec - 5,523,928,525 cycles # 2.924 GHz - 11,561,426,307 instructions # 2.09 insn per cycle - 1.891796100 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.869802 sec + 5,458,289,042 cycles # 2.911 GHz + 11,250,961,339 instructions # 2.06 insn per cycle + 1.875881825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.812282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.507614e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.507614e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.645132 sec - 4,825,784,737 cycles # 2.924 GHz - 10,338,194,804 instructions # 2.14 insn per cycle - 1.651332428 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +TOTAL : 1.719311 sec + 5,034,836,824 cycles # 2.920 GHz + 10,558,271,294 instructions # 2.10 insn per cycle + 1.725057980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.459424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.748576e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.748576e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.457548 sec - 4,961,918,439 cycles # 2.015 GHz - 7,553,551,302 instructions # 1.52 insn per cycle - 2.463770367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +TOTAL : 2.716839 sec + 5,403,556,568 cycles # 1.987 GHz + 7,794,191,095 instructions # 1.44 insn per cycle + 2.722528243 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 64a6ffae37..73356b00dd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:06:51 +DATE: 2024-03-01_02:27:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.024315e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.137212e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273722e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.527459 sec - 2,303,911,509 cycles # 3.021 GHz - 3,282,093,306 instructions # 1.42 insn per cycle - 0.841396629 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.538743 sec + 2,297,794,086 cycles # 2.963 GHz + 3,276,125,304 instructions # 1.43 insn per cycle + 0.856267333 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.269898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.339915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.339915e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.724475 sec - 14,695,477,691 cycles # 3.107 GHz - 39,544,600,287 instructions # 2.69 insn per cycle - 4.738338098 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.877526 sec + 15,081,677,651 cycles # 3.089 GHz + 40,100,660,385 instructions # 2.66 insn per cycle + 4.889980594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.819317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.044125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.044125e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.855662 sec - 8,611,495,269 cycles # 3.013 GHz - 23,577,603,947 instructions # 2.74 insn per cycle - 2.876870171 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.787478 sec + 8,606,981,244 cycles # 3.082 GHz + 23,670,854,000 instructions # 2.75 insn per cycle + 2.801213189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.386792e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.829860e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.829860e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.054853 sec - 5,976,472,878 cycles # 2.900 GHz - 13,192,591,343 instructions # 2.21 insn per cycle - 2.073701205 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) +TOTAL : 2.088271 sec + 6,101,163,180 cycles # 2.915 GHz + 13,060,965,379 instructions # 2.14 insn per cycle + 2.110411764 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.863746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.385624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.385624e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.894056 sec - 5,556,974,582 cycles # 2.925 GHz - 12,102,123,600 instructions # 2.18 insn per cycle - 1.916239054 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) +TOTAL : 2.007458 sec + 5,795,313,103 cycles # 2.878 GHz + 12,320,114,352 instructions # 2.13 insn per cycle + 2.035740422 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.164130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.414147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.414147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.622660 sec - 5,367,738,607 cycles # 2.042 GHz - 9,380,734,862 instructions # 1.75 insn per cycle - 2.639729471 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) +TOTAL : 3.054998 sec + 5,836,990,709 cycles # 1.908 GHz + 9,601,704,067 instructions # 1.64 insn per cycle + 3.069883688 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 7e05113160..7ca7ca6f27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:45:40 +DATE: 2024-03-01_03:03:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.570294e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157680e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272948e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.517984 sec - 2,258,663,150 cycles # 3.002 GHz - 3,251,980,093 instructions # 1.44 insn per cycle - 0.809803833 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.520509 sec + 2,251,864,611 cycles # 2.979 GHz + 3,200,076,053 instructions # 1.42 insn per cycle + 0.813049887 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.389882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.467416e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.467416e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.492234 sec - 13,897,256,384 cycles # 3.090 GHz - 35,848,363,284 instructions # 2.58 insn per cycle - 4.498607059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.235724 sec + 13,018,811,907 cycles # 3.070 GHz + 34,384,492,801 instructions # 2.64 insn per cycle + 4.241723051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.126893e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.381970e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.381970e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.644860 sec - 8,205,375,516 cycles # 3.096 GHz - 21,906,179,334 instructions # 2.67 insn per cycle - 2.651401622 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.527791 sec + 10,618,068,276 cycles # 3.005 GHz + 24,006,297,751 instructions # 2.26 insn per cycle + 3.533644608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.864313e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.381523e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.381523e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.894123 sec - 5,543,467,549 cycles # 2.919 GHz - 12,075,161,866 instructions # 2.18 insn per cycle - 1.900622314 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) +TOTAL : 2.268558 sec + 6,594,099,256 cycles # 2.900 GHz + 12,400,446,525 instructions # 1.88 insn per cycle + 2.274329127 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.357489e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.970171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.970171e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.754895 sec - 5,133,966,519 cycles # 2.917 GHz - 11,142,120,432 instructions # 2.17 insn per cycle - 1.761680532 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) +TOTAL : 2.142175 sec + 6,250,159,272 cycles # 2.911 GHz + 11,574,474,977 instructions # 1.85 insn per cycle + 2.148019416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.405916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.704474e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.704474e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.490028 sec - 4,802,416,326 cycles # 1.924 GHz - 8,842,419,328 instructions # 1.84 insn per cycle - 2.496721986 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) +TOTAL : 2.637824 sec + 5,343,225,675 cycles # 2.022 GHz + 9,294,792,947 instructions # 1.74 insn per cycle + 2.643638198 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 4e18a74fd4..6740b658ab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,194 +13,194 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:46:07 +DATE: 2024-03-01_03:04:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571168e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.164253e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.280238e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.519386 sec - 2,267,911,515 cycles # 3.016 GHz - 3,251,447,706 instructions # 1.43 insn per cycle - 0.810353004 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.525125 sec + 2,266,508,632 cycles # 2.999 GHz + 3,227,683,893 instructions # 1.42 insn per cycle + 0.815560561 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063388516822 Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.676449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.774483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.774483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.023481 sec - 12,489,396,939 cycles # 3.100 GHz - 35,729,005,545 instructions # 2.86 insn per cycle - 4.030070872 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.008193 sec + 12,350,315,150 cycles # 3.077 GHz + 35,037,181,267 instructions # 2.84 insn per cycle + 4.014100641 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.223056e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.492367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.492367e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.587451 sec - 8,029,022,316 cycles # 3.097 GHz - 21,259,346,955 instructions # 2.65 insn per cycle - 2.593594031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.458899 sec + 10,688,048,117 cycles # 3.085 GHz + 23,082,662,787 instructions # 2.16 insn per cycle + 3.464737128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.161232e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.723558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.723558e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.806475 sec - 5,298,285,942 cycles # 2.924 GHz - 11,405,384,263 instructions # 2.15 insn per cycle - 1.812976231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) +TOTAL : 2.175532 sec + 6,167,789,524 cycles # 2.829 GHz + 11,956,365,830 instructions # 1.94 insn per cycle + 2.181490352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.596114e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.252611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.252611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.694412 sec - 4,989,073,893 cycles # 2.936 GHz - 10,598,434,802 instructions # 2.12 insn per cycle - 1.700669389 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) +TOTAL : 2.062589 sec + 6,012,687,929 cycles # 2.908 GHz + 11,129,506,913 instructions # 1.85 insn per cycle + 2.068524285 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.739884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.066921e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.066921e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.317514 sec - 4,718,237,627 cycles # 2.031 GHz - 8,566,667,295 instructions # 1.82 insn per cycle - 2.324277048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) +TOTAL : 2.581777 sec + 5,215,223,845 cycles # 2.016 GHz + 9,019,923,506 instructions # 1.73 insn per cycle + 2.587755549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 8caa99d7b5..3164378b7a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:07:18 +DATE: 2024-03-01_02:28:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.268619e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.598652e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.970928e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.479559 sec - 2,115,745,361 cycles # 3.004 GHz - 3,026,067,491 instructions # 1.43 insn per cycle - 0.780309314 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.485254 sec + 2,068,141,298 cycles # 2.904 GHz + 2,916,142,359 instructions # 1.41 insn per cycle + 0.784434250 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.373505e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.452479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.452479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.502538 sec - 13,910,563,154 cycles # 3.086 GHz - 37,077,613,443 instructions # 2.67 insn per cycle - 4.514374712 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.621612 sec + 14,026,409,554 cycles # 3.032 GHz + 38,341,238,705 instructions # 2.73 insn per cycle + 4.632085783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.415916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.883206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.883206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.020931 sec - 6,164,585,136 cycles # 3.041 GHz - 15,211,692,086 instructions # 2.47 insn per cycle - 2.038780280 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.094155 sec + 6,477,656,873 cycles # 3.085 GHz + 15,815,714,256 instructions # 2.44 insn per cycle + 2.109661469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.178438e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.056543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056543e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.231884 sec - 3,441,665,218 cycles # 2.780 GHz - 7,715,516,449 instructions # 2.24 insn per cycle - 1.250456436 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.180439 sec + 3,464,791,228 cycles # 2.924 GHz + 7,594,553,534 instructions # 2.19 insn per cycle + 1.196926932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053344e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.232505e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.232505e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.080384 sec - 3,174,573,431 cycles # 2.923 GHz - 7,109,345,653 instructions # 2.24 insn per cycle - 1.096556108 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.103361 sec + 3,253,544,502 cycles # 2.935 GHz + 7,202,500,133 instructions # 2.21 insn per cycle + 1.115792553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.797220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.705551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.705551e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.430051 sec - 2,982,263,461 cycles # 2.077 GHz - 5,763,882,815 instructions # 1.93 insn per cycle - 1.451099483 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +TOTAL : 1.467307 sec + 3,062,229,633 cycles # 2.079 GHz + 5,834,823,887 instructions # 1.91 insn per cycle + 1.480044473 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index efebdc8ccf..b32abcb3fe 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:56:28 +DATE: 2024-03-01_03:14:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.147887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.526108e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.526108e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.661489 sec - 2,693,816,911 cycles # 3.036 GHz - 4,104,498,443 instructions # 1.52 insn per cycle - 0.946519385 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +TOTAL : 0.665285 sec + 2,679,931,908 cycles # 3.001 GHz + 4,173,181,221 instructions # 1.56 insn per cycle + 0.950193790 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,151 +72,151 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.324858e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.402467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.402467e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.637720 sec - 14,089,911,598 cycles # 3.040 GHz - 37,125,365,010 instructions # 2.63 insn per cycle - 4.644678781 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.608146 sec + 14,198,803,048 cycles # 3.078 GHz + 38,383,841,480 instructions # 2.70 insn per cycle + 4.614561058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.462474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.934411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.934411e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.047964 sec - 6,366,264,162 cycles # 3.100 GHz - 15,491,842,039 instructions # 2.43 insn per cycle - 2.055155435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.164951 sec + 6,682,648,138 cycles # 3.079 GHz + 16,095,511,662 instructions # 2.41 insn per cycle + 2.171478460 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.528714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.093980e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.229951 sec - 3,645,466,406 cycles # 2.949 GHz - 7,952,982,935 instructions # 2.18 insn per cycle - 1.237058942 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.245724 sec + 3,655,872,382 cycles # 2.921 GHz + 7,830,960,228 instructions # 2.14 insn per cycle + 1.252058919 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.203411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.203411e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.149866 sec - 3,370,367,591 cycles # 2.919 GHz - 7,347,327,720 instructions # 2.18 insn per cycle - 1.156955542 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.193275 sec + 3,439,455,837 cycles # 2.869 GHz + 7,440,735,686 instructions # 2.16 insn per cycle + 1.199824293 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.733701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626137e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.486462 sec - 3,189,866,182 cycles # 2.137 GHz - 6,021,180,514 instructions # 1.89 insn per cycle - 1.493653390 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +TOTAL : 1.539244 sec + 3,276,504,779 cycles # 2.121 GHz + 6,089,433,455 instructions # 1.86 insn per cycle + 1.545785864 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 7acf133e3f..1418229a2f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:09:40 +DATE: 2024-03-01_03:27:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.412916e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.619089e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.939568e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962164e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.556731 sec - 2,380,708,912 cycles # 3.021 GHz - 3,476,777,878 instructions # 1.46 insn per cycle - 0.845720258 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +TOTAL : 0.558880 sec + 2,364,095,478 cycles # 3.003 GHz + 3,484,344,192 instructions # 1.47 insn per cycle + 0.845198156 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.373825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451993e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451993e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.553769 sec - 14,061,285,831 cycles # 3.085 GHz - 37,107,009,606 instructions # 2.64 insn per cycle - 4.559663349 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.585598 sec + 14,172,267,813 cycles # 3.088 GHz + 38,370,669,897 instructions # 2.71 insn per cycle + 4.590984697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.488905e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.966686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.966686e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 -TOTAL : 2.047136 sec - 6,328,275,118 cycles # 3.084 GHz - 15,224,386,903 instructions # 2.41 insn per cycle - 2.052913694 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 2.148796 sec + 6,634,619,629 cycles # 3.081 GHz + 15,827,825,218 instructions # 2.39 insn per cycle + 2.154083020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.247802e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.063995e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.063995e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.274630 sec - 3,616,593,672 cycles # 2.827 GHz - 7,699,869,910 instructions # 2.13 insn per cycle - 1.280330551 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +TOTAL : 1.236002 sec + 3,624,228,310 cycles # 2.921 GHz + 7,577,923,207 instructions # 2.09 insn per cycle + 1.241371528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.234165e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.234165e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.134805 sec - 3,350,043,075 cycles # 2.939 GHz - 7,059,303,612 instructions # 2.11 insn per cycle - 1.140687023 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +TOTAL : 1.166800 sec + 3,412,475,771 cycles # 2.913 GHz + 7,154,107,852 instructions # 2.10 insn per cycle + 1.172143118 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.769621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.675649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.675649e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.489497 sec - 3,171,881,548 cycles # 2.123 GHz - 5,713,001,657 instructions # 1.80 insn per cycle - 1.495609576 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +TOTAL : 1.519807 sec + 3,228,336,001 cycles # 2.118 GHz + 5,784,936,071 instructions # 1.79 insn per cycle + 1.525231071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 97dabf5ef5..6cc1ea482a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:06:20 +DATE: 2024-03-01_03:24:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.471523e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.653169e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969998e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.444388e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637591e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958095e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.507312 sec - 2,173,619,290 cycles # 2.997 GHz - 3,399,295,191 instructions # 1.56 insn per cycle - 0.785350773 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +TOTAL : 0.506234 sec + 2,151,061,698 cycles # 2.964 GHz + 3,317,932,316 instructions # 1.54 insn per cycle + 0.783859096 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.381142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.460290e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.460290e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.348187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425786e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.486749 sec - 13,885,368,304 cycles # 3.091 GHz - 37,077,531,970 instructions # 2.67 insn per cycle - 4.492745730 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.550945 sec + 14,020,959,724 cycles # 3.078 GHz + 38,340,893,799 instructions # 2.73 insn per cycle + 4.556370309 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.472143e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.958884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.958884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.084306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.497288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.497288e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.001306 sec - 6,161,638,049 cycles # 3.071 GHz - 15,211,574,228 instructions # 2.47 insn per cycle - 2.007539907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.146528 sec + 6,470,246,026 cycles # 3.008 GHz + 15,815,477,798 instructions # 2.44 insn per cycle + 2.151761392 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.708734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.120132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.120132e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.163460 sec - 3,453,582,210 cycles # 2.956 GHz - 7,714,676,572 instructions # 2.23 insn per cycle - 1.169489643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.654131e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108425e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.168173 sec + 3,446,745,579 cycles # 2.939 GHz + 7,593,552,481 instructions # 2.20 insn per cycle + 1.173417445 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023571e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194006e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.108590 sec - 3,177,081,334 cycles # 2.853 GHz - 7,108,573,900 instructions # 2.24 insn per cycle - 1.114719814 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.035097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201064e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.096134 sec + 3,246,063,667 cycles # 2.949 GHz + 7,201,559,823 instructions # 2.22 insn per cycle + 1.101526557 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.915759e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.846765e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.846765e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.601752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.455480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.455480e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.408180 sec - 2,979,297,592 cycles # 2.108 GHz - 5,762,569,288 instructions # 1.93 insn per cycle - 1.414266379 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +TOTAL : 1.463312 sec + 3,061,733,109 cycles # 2.086 GHz + 5,833,735,363 instructions # 1.91 insn per cycle + 1.468683964 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index a5259989f5..d1c301e36a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,199 +13,199 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_22:03:04 +DATE: 2024-03-01_03:21:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.958404e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.636826e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.959534e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.609516 sec - 2,471,096,266 cycles # 2.985 GHz - 3,829,067,264 instructions # 1.55 insn per cycle - 0.888079293 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +TOTAL : 0.625420 sec + 2,414,961,393 cycles # 2.854 GHz + 3,791,061,685 instructions # 1.57 insn per cycle + 0.904442863 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.336964e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.415294e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.415294e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.572372 sec - 13,889,600,632 cycles # 3.035 GHz - 37,077,606,876 instructions # 2.67 insn per cycle - 4.578241412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.586154 sec + 14,183,213,679 cycles # 3.090 GHz + 38,341,040,102 instructions # 2.70 insn per cycle + 4.591510537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.532200e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.012572e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.012572e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.978043 sec - 6,163,217,538 cycles # 3.108 GHz - 15,211,693,777 instructions # 2.47 insn per cycle - 1.984000649 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.084805 sec + 6,467,654,599 cycles # 3.095 GHz + 15,814,952,627 instructions # 2.45 insn per cycle + 2.090234852 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.669054e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112341e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112341e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.167764 sec - 3,437,424,675 cycles # 2.931 GHz - 7,714,739,146 instructions # 2.24 insn per cycle - 1.173813048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.181028 sec + 3,453,301,700 cycles # 2.913 GHz + 7,593,575,205 instructions # 2.20 insn per cycle + 1.186225517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.060939e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242481e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242481e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.072289 sec - 3,171,680,355 cycles # 2.945 GHz - 7,108,503,211 instructions # 2.24 insn per cycle - 1.078280755 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.108864 sec + 3,247,038,827 cycles # 2.916 GHz + 7,202,168,264 instructions # 2.22 insn per cycle + 1.114391762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.899146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.828249e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.828249e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.411176 sec - 2,980,683,396 cycles # 2.105 GHz - 5,762,290,086 instructions # 1.93 insn per cycle - 1.417037826 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +TOTAL : 1.464294 sec + 3,059,603,183 cycles # 2.083 GHz + 5,833,854,527 instructions # 1.91 insn per cycle + 1.469681735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 5fa43347d8..adc2ed2114 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:07:42 +DATE: 2024-03-01_02:28:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.331904e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630922e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.023726e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478203 sec - 2,110,495,883 cycles # 2.993 GHz - 3,011,337,942 instructions # 1.43 insn per cycle - 0.782662262 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.480923 sec + 2,116,431,851 cycles # 3.003 GHz + 3,022,655,895 instructions # 1.43 insn per cycle + 0.777218279 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.399483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.481184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.481184e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.454439 sec - 13,804,474,973 cycles # 3.096 GHz - 37,479,357,412 instructions # 2.72 insn per cycle - 4.468292450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.644587 sec + 14,360,257,758 cycles # 3.089 GHz + 39,833,716,550 instructions # 2.77 insn per cycle + 4.652300252 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288199028000236 +Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.226031e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.848097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.848097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.767871 sec - 5,476,477,924 cycles # 3.088 GHz - 15,244,208,658 instructions # 2.78 insn per cycle - 1.792416092 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.888755 sec + 5,601,188,109 cycles # 2.957 GHz + 15,285,931,975 instructions # 2.73 insn per cycle + 1.901754882 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.862323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.568663e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.568663e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.611743 sec - 4,709,783,450 cycles # 2.912 GHz - 9,849,850,399 instructions # 2.09 insn per cycle - 1.628871192 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.623137 sec + 4,755,173,593 cycles # 2.919 GHz + 9,735,141,159 instructions # 2.05 insn per cycle + 1.639641207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.897868e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.669459e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.669459e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.607965 sec - 4,489,380,710 cycles # 2.782 GHz - 9,202,142,806 instructions # 2.05 insn per cycle - 1.628198262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.586631 sec + 4,632,931,570 cycles # 2.912 GHz + 9,326,747,974 instructions # 2.01 insn per cycle + 1.599475417 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.506579e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.128587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.128587e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.696440 sec - 3,457,553,359 cycles # 2.031 GHz - 6,874,633,785 instructions # 1.99 insn per cycle - 1.719781829 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) +TOTAL : 1.762945 sec + 3,668,593,409 cycles # 2.074 GHz + 7,034,535,336 instructions # 1.92 insn per cycle + 1.779301540 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183217635378 -Relative difference = 1.5859655131013432e-07 +Avg ME (F77/C++) = 2.0288183459779248 +Relative difference = 1.7053177021099307e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 0f0c0cba4d..82aee2242c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:46:32 +DATE: 2024-03-01_03:04:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.468755e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.656223e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.977409e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.476050 sec - 2,139,983,474 cycles # 3.012 GHz - 3,041,923,459 instructions # 1.42 insn per cycle - 0.769270550 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.478757 sec + 2,104,839,063 cycles # 2.996 GHz + 2,995,662,279 instructions # 1.42 insn per cycle + 0.760483148 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.688483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.789170e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.789170e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.984623 sec - 12,412,838,771 cycles # 3.111 GHz - 34,217,255,304 instructions # 2.76 insn per cycle - 3.991638757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.311067 sec + 12,598,770,011 cycles # 2.919 GHz + 34,372,549,657 instructions # 2.73 insn per cycle + 4.316594695 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.428453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.090008e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.090008e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.714349 sec - 5,359,595,376 cycles # 3.117 GHz - 14,586,771,788 instructions # 2.72 insn per cycle - 1.720584618 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.978899 sec + 6,105,197,866 cycles # 3.078 GHz + 14,859,942,037 instructions # 2.43 insn per cycle + 1.984598314 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192580919713 -Relative difference = 1.2721291123071246e-07 +Avg ME (F77/C++) = 2.0288193803280592 +Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.084909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.076524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.076524e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.381325 sec - 4,058,929,495 cycles # 2.928 GHz - 9,088,076,266 instructions # 2.24 insn per cycle - 1.387605112 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.494763 sec + 4,316,279,907 cycles # 2.878 GHz + 9,028,948,283 instructions # 2.09 insn per cycle + 1.500523975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.757180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.937518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.937518e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.279969 sec - 3,797,061,121 cycles # 2.955 GHz - 8,440,365,534 instructions # 2.22 insn per cycle - 1.285904868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.509333 sec + 4,207,142,397 cycles # 2.778 GHz + 8,663,183,236 instructions # 2.06 insn per cycle + 1.515104262 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.095576e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.636466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.636466e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.805353 sec - 3,727,936,926 cycles # 2.059 GHz - 7,571,161,321 instructions # 2.03 insn per cycle - 1.811498455 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) +TOTAL : 1.886655 sec + 3,832,564,290 cycles # 2.026 GHz + 7,807,000,610 instructions # 2.04 insn per cycle + 1.892395760 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183350348845 -Relative difference = 1.6513796936156652e-07 +Avg ME (F77/C++) = 2.0288183246739209 +Relative difference = 1.6003107281264138e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 494b018564..dda1db1b3c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:46:55 +DATE: 2024-03-01_03:05:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.467486e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.714851e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.049410e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.473680 sec - 2,149,151,229 cycles # 3.027 GHz - 3,029,021,746 instructions # 1.41 insn per cycle - 0.767337928 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.478497 sec + 2,092,584,267 cycles # 2.987 GHz + 2,982,481,806 instructions # 1.43 insn per cycle + 0.759974164 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028811e+00 Avg ME (F77/CUDA) = 2.0288499749731272 Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.769244e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.876189e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.876189e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.872564 sec - 11,947,072,264 cycles # 3.082 GHz - 35,407,153,685 instructions # 2.96 insn per cycle - 3.878838879 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.962914 sec + 11,745,545,496 cycles # 2.960 GHz + 35,108,793,810 instructions # 2.99 insn per cycle + 3.968579892 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.615505e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.365022e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.365022e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.673174 sec - 5,080,150,146 cycles # 3.032 GHz - 14,046,773,867 instructions # 2.77 insn per cycle - 1.679714771 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.925244 sec + 5,962,598,726 cycles # 3.089 GHz + 14,469,931,867 instructions # 2.43 insn per cycle + 1.931094914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192554144189 -Relative difference = 1.2589315209891237e-07 +Avg ME (F77/C++) = 2.0288193583255634 +Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.217955e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.248962e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.248962e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.360539 sec - 3,995,329,762 cycles # 2.926 GHz - 8,629,021,722 instructions # 2.16 insn per cycle - 1.366675172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.475701 sec + 4,155,772,808 cycles # 2.809 GHz + 8,874,967,057 instructions # 2.14 insn per cycle + 1.481449825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.478461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.623535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.623535e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.323023 sec - 3,695,157,256 cycles # 2.782 GHz - 8,100,478,864 instructions # 2.19 insn per cycle - 1.329144793 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.405788 sec + 4,123,527,517 cycles # 2.923 GHz + 8,411,119,259 instructions # 2.04 insn per cycle + 1.411551419 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.395898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.992762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.992762e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.723948 sec - 3,575,759,856 cycles # 2.068 GHz - 7,372,918,595 instructions # 2.06 insn per cycle - 1.730165324 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) +TOTAL : 1.851731 sec + 3,787,634,254 cycles # 2.040 GHz + 7,699,934,932 instructions # 2.03 insn per cycle + 1.857323010 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183569209650 -Relative difference = 1.7592557106041962e-07 +Avg ME (F77/C++) = 2.0288183204829693 +Relative difference = 1.5796536184903122e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index c9935b5b0c..9748a5aab4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:08:06 +DATE: 2024-03-01_02:28:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.019007e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132370e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269812e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529781 sec - 2,298,734,230 cycles # 2.997 GHz - 3,298,095,711 instructions # 1.43 insn per cycle - 0.850734215 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +TOTAL : 0.526886 sec + 2,307,341,508 cycles # 3.024 GHz + 3,271,429,537 instructions # 1.42 insn per cycle + 0.836809323 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063423243874 Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.178285e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.242535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.242535e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.919545 sec - 15,213,150,802 cycles # 3.089 GHz - 39,292,730,130 instructions # 2.58 insn per cycle - 4.935455317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.926720 sec + 15,303,062,403 cycles # 3.103 GHz + 38,574,821,235 instructions # 2.52 insn per cycle + 4.935986004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.613457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.806382e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.806382e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.007093 sec - 8,851,938,276 cycles # 2.938 GHz - 24,094,532,863 instructions # 2.72 insn per cycle - 3.022798970 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.903163 sec + 8,984,859,488 cycles # 3.089 GHz + 24,224,163,348 instructions # 2.70 insn per cycle + 2.918366508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.951513e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.479974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.479974e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.867658 sec - 5,498,255,305 cycles # 2.934 GHz - 11,448,732,219 instructions # 2.08 insn per cycle - 1.885253906 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) +TOTAL : 1.860423 sec + 5,396,289,064 cycles # 2.891 GHz + 11,276,510,611 instructions # 2.09 insn per cycle + 1.875091896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.941897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.671977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.671977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.615140 sec - 4,777,526,896 cycles # 2.947 GHz - 10,317,033,200 instructions # 2.16 insn per cycle - 1.641239162 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) +TOTAL : 1.648151 sec + 4,836,682,110 cycles # 2.924 GHz + 10,524,586,299 instructions # 2.18 insn per cycle + 1.662467551 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.596452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.908012e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.908012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.388979 sec - 4,869,533,602 cycles # 2.034 GHz - 7,366,156,467 instructions # 1.51 insn per cycle - 2.406026106 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) +TOTAL : 2.587933 sec + 5,228,382,592 cycles # 2.016 GHz + 7,603,380,674 instructions # 1.45 insn per cycle + 2.604403134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index f6af1f82c5..4c3bdeb3a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,197 +13,197 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-02-05_21:08:33 +DATE: 2024-03-01_02:29:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.047759e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.135912e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275542e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530154 sec - 2,242,423,246 cycles # 2.921 GHz - 3,250,635,158 instructions # 1.45 insn per cycle - 0.850969646 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +TOTAL : 0.529654 sec + 2,293,467,091 cycles # 2.992 GHz + 3,241,408,242 instructions # 1.41 insn per cycle + 0.836485234 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 Avg ME (F77/CUDA) = 2.0288063423243874 Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.204740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.270958e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.270958e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.862249 sec - 15,082,353,353 cycles # 3.099 GHz - 40,115,404,660 instructions # 2.66 insn per cycle - 4.874821925 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.994421 sec + 15,338,753,655 cycles # 3.068 GHz + 40,369,233,372 instructions # 2.63 insn per cycle + 5.002383718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.851334e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.075206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.075206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.828378 sec - 8,690,865,791 cycles # 3.066 GHz - 23,534,050,563 instructions # 2.71 insn per cycle - 2.847900143 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.723159 sec + 8,478,435,163 cycles # 3.107 GHz + 23,253,497,249 instructions # 2.74 insn per cycle + 2.738604338 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.250913e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.667762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.667762e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.101952 sec - 6,182,697,935 cycles # 2.933 GHz - 13,102,957,598 instructions # 2.12 insn per cycle - 2.120994795 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +TOTAL : 2.127824 sec + 6,241,547,842 cycles # 2.925 GHz + 12,962,413,577 instructions # 2.08 insn per cycle + 2.144515260 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.627757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.100688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.100688e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.968170 sec - 5,762,105,462 cycles # 2.919 GHz - 12,210,003,651 instructions # 2.12 insn per cycle - 1.989376159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) +TOTAL : 2.074458 sec + 5,923,278,346 cycles # 2.853 GHz + 12,242,730,346 instructions # 2.07 insn per cycle + 2.086429072 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.203452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.459686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.459686e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.600019 sec - 5,268,712,876 cycles # 2.022 GHz - 8,448,368,712 instructions # 1.60 insn per cycle - 2.618948581 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) +TOTAL : 2.794263 sec + 5,618,790,292 cycles # 2.007 GHz + 8,743,459,975 instructions # 1.56 insn per cycle + 2.808786612 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 26d1384e8b..c4c4bff630 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:09:01 +DATE: 2024-03-01_02:29:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.517520e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050894e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.067550e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463659 sec - 2,055,183,874 cycles # 3.003 GHz - 2,934,312,859 instructions # 1.43 insn per cycle - 0.756953193 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.463329 sec + 2,069,832,304 cycles # 3.002 GHz + 2,918,096,235 instructions # 1.41 insn per cycle + 0.772559551 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.043868e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.322139e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.339141e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.609822 sec - 2,563,240,653 cycles # 3.017 GHz - 3,868,983,956 instructions # 1.51 insn per cycle - 0.911526050 seconds time elapsed +TOTAL : 0.608947 sec + 2,562,374,732 cycles # 3.012 GHz + 3,879,371,783 instructions # 1.51 insn per cycle + 0.910123971 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 Avg ME (F77/CUDA) = 1.4131213684418649 Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.614247e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.626993e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.626993e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.289889 sec - 19,495,538,491 cycles # 3.098 GHz - 57,922,287,734 instructions # 2.97 insn per cycle - 6.296761383 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.359535 sec + 19,687,428,773 cycles # 3.094 GHz + 59,604,296,849 instructions # 3.03 insn per cycle + 6.365859123 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.011387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.057745e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.057745e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.292197 sec - 10,190,979,957 cycles # 3.092 GHz - 29,943,491,460 instructions # 2.94 insn per cycle - 3.306811606 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.515479 sec + 10,373,655,779 cycles # 2.948 GHz + 30,676,465,519 instructions # 2.96 insn per cycle + 3.528584808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.433832e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.604416e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.604416e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.759958 sec - 4,910,813,970 cycles # 2.783 GHz - 11,211,044,010 instructions # 2.28 insn per cycle - 1.771404819 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +TOTAL : 1.702212 sec + 4,885,421,396 cycles # 2.863 GHz + 11,020,224,832 instructions # 2.26 insn per cycle + 1.717667988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.126386e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.149735e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.149735e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.477093 sec - 4,299,259,698 cycles # 2.902 GHz - 10,187,392,743 instructions # 2.37 insn per cycle - 1.492930763 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +TOTAL : 1.517268 sec + 4,368,757,303 cycles # 2.872 GHz + 10,296,904,442 instructions # 2.36 insn per cycle + 1.532957385 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.244237e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.368358e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.368358e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.011538 sec - 3,897,655,272 cycles # 1.933 GHz - 5,708,540,517 instructions # 1.46 insn per cycle - 2.026194749 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +TOTAL : 2.135983 sec + 4,101,318,849 cycles # 1.917 GHz + 5,843,401,136 instructions # 1.42 insn per cycle + 2.151041040 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index e40acb18da..7a80a6327c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:56:52 +DATE: 2024-03-01_03:14:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.636814e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.747936e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.747936e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.488125 sec - 2,115,646,575 cycles # 3.006 GHz - 3,165,191,660 instructions # 1.50 insn per cycle - 0.761567703 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.494713 sec + 2,059,588,733 cycles # 2.926 GHz + 3,067,379,574 instructions # 1.49 insn per cycle + 0.764554853 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,166 +72,166 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.698370e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.508650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.508650e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.819811 sec - 3,231,643,804 cycles # 3.017 GHz - 5,170,954,794 instructions # 1.60 insn per cycle - 1.129350867 seconds time elapsed +TOTAL : 0.824199 sec + 3,179,114,916 cycles # 2.965 GHz + 5,069,610,946 instructions # 1.59 insn per cycle + 1.133521853 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 Avg ME (F77/CUDA) = 1.4131213684418649 Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.586265e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.599432e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.599432e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.365086 sec - 19,535,294,271 cycles # 3.067 GHz - 57,927,366,883 instructions # 2.97 insn per cycle - 6.370310036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.518056 sec + 19,750,480,394 cycles # 3.028 GHz + 59,611,727,500 instructions # 3.02 insn per cycle + 6.522447301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.890841e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.937483e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.937483e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.379934 sec - 10,238,179,748 cycles # 3.026 GHz - 29,994,338,386 instructions # 2.93 insn per cycle - 3.385379622 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.370584 sec + 10,396,817,898 cycles # 3.081 GHz + 30,723,473,589 instructions # 2.96 insn per cycle + 3.375008450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.771282e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.950160e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.950160e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.706937 sec - 4,944,909,058 cycles # 2.891 GHz - 11,258,821,376 instructions # 2.28 insn per cycle - 1.712018011 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +TOTAL : 1.685691 sec + 4,902,930,827 cycles # 2.902 GHz + 11,066,989,869 instructions # 2.26 insn per cycle + 1.690115997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.116302e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.139544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.497508 sec - 4,331,250,443 cycles # 2.884 GHz - 10,237,737,318 instructions # 2.36 insn per cycle - 1.502820787 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +TOTAL : 1.513774 sec + 4,402,683,305 cycles # 2.901 GHz + 10,346,890,880 instructions # 2.35 insn per cycle + 1.518250177 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.218152e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.344259e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.344259e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.025386 sec - 3,933,406,562 cycles # 1.938 GHz - 5,747,129,673 instructions # 1.46 insn per cycle - 2.030424043 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +TOTAL : 2.132010 sec + 4,131,468,761 cycles # 1.935 GHz + 5,881,941,509 instructions # 1.42 insn per cycle + 2.136586909 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index e6e9b880f5..90bf6e6455 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:09:30 +DATE: 2024-03-01_02:30:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.433452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035720e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052077e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.461879 sec - 2,049,463,501 cycles # 3.004 GHz - 2,931,930,967 instructions # 1.43 insn per cycle - 0.758516650 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.465265 sec + 2,029,896,808 cycles # 2.980 GHz + 2,854,741,238 instructions # 1.41 insn per cycle + 0.763772288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.036270e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309299e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325891e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.604488 sec - 2,554,084,852 cycles # 3.010 GHz - 3,806,222,274 instructions # 1.49 insn per cycle - 0.908408851 seconds time elapsed +TOTAL : 0.607194 sec + 2,545,937,909 cycles # 2.996 GHz + 3,826,405,631 instructions # 1.50 insn per cycle + 0.909330494 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 Avg ME (F77/CUDA) = 1.4131213684418649 Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.603637e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.616413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.616413e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.315367 sec - 19,440,595,699 cycles # 3.076 GHz - 57,746,273,687 instructions # 2.97 insn per cycle - 6.322559095 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.317260 sec + 19,445,883,412 cycles # 3.076 GHz + 58,795,735,881 instructions # 3.02 insn per cycle + 6.323702590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.011637e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.057908e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.057908e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.291611 sec - 10,256,692,556 cycles # 3.113 GHz - 30,334,472,962 instructions # 2.96 insn per cycle - 3.303684492 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.363533 sec + 10,256,448,579 cycles # 3.046 GHz + 30,347,165,405 instructions # 2.96 insn per cycle + 3.377280590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.549169e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.717858e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.717858e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.738062 sec - 5,053,352,211 cycles # 2.901 GHz - 11,664,563,199 instructions # 2.31 insn per cycle - 1.751875656 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) +TOTAL : 1.728674 sec + 5,043,692,461 cycles # 2.911 GHz + 11,484,727,811 instructions # 2.28 insn per cycle + 1.738921569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.049432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.069832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.069832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.583348 sec - 4,610,661,316 cycles # 2.904 GHz - 10,805,809,627 instructions # 2.34 insn per cycle - 1.595035145 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) +TOTAL : 1.607009 sec + 4,642,681,786 cycles # 2.882 GHz + 10,842,961,046 instructions # 2.34 insn per cycle + 1.618440779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.147443e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.270030e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.270030e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.035235 sec - 3,947,076,552 cycles # 1.936 GHz - 5,998,684,941 instructions # 1.52 insn per cycle - 2.049078012 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) +TOTAL : 2.134046 sec + 4,109,311,958 cycles # 1.922 GHz + 6,106,472,133 instructions # 1.49 insn per cycle + 2.145705149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 220563e4c6..af4f474b65 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:09:59 +DATE: 2024-03-01_02:30:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.389953e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.379235e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.499735e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445751 sec - 1,974,295,648 cycles # 2.963 GHz - 2,744,935,553 instructions # 1.39 insn per cycle - 0.739534699 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.445727 sec + 2,001,558,197 cycles # 3.000 GHz + 2,820,746,449 instructions # 1.41 insn per cycle + 0.736568143 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.048502e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.406399e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.505283e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.496598 sec - 2,171,209,286 cycles # 2.997 GHz - 3,099,694,880 instructions # 1.43 insn per cycle - 0.782136858 seconds time elapsed +TOTAL : 0.500107 sec + 2,158,124,631 cycles # 2.977 GHz + 3,092,829,809 instructions # 1.43 insn per cycle + 0.784432881 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 Avg ME (F77/CUDA) = 1.4132214346515752 Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.797017e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.812360e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.812360e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 5.878452 sec - 18,166,915,369 cycles # 3.088 GHz - 55,237,437,169 instructions # 3.04 insn per cycle - 5.886190167 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.146873 sec + 19,061,096,774 cycles # 3.099 GHz + 58,958,014,215 instructions # 3.09 insn per cycle + 6.153306662 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.039866e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.198983e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.198983e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.832280 sec - 5,681,092,016 cycles # 3.093 GHz - 16,128,089,686 instructions # 2.84 insn per cycle - 1.846415672 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.886682 sec + 5,850,782,122 cycles # 3.096 GHz + 16,695,269,066 instructions # 2.85 insn per cycle + 1.898716135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.890873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.959141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.959141e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.887375 sec - 2,587,042,735 cycles # 2.901 GHz - 6,085,514,363 instructions # 2.35 insn per cycle - 0.899074779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +TOTAL : 0.886334 sec + 2,581,461,055 cycles # 2.900 GHz + 5,980,838,355 instructions # 2.32 insn per cycle + 0.901108038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.130938e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.235290e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.235290e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.789645 sec - 2,308,576,199 cycles # 2.908 GHz - 5,552,964,613 instructions # 2.41 insn per cycle - 0.803204840 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +TOTAL : 0.825324 sec + 2,349,134,788 cycles # 2.832 GHz + 5,603,128,082 instructions # 2.39 insn per cycle + 0.837493797 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637248e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.690128e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.690128e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.023178 sec - 2,012,076,992 cycles # 1.958 GHz - 3,285,913,321 instructions # 1.63 insn per cycle - 1.035051327 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +TOTAL : 1.138775 sec + 2,054,810,359 cycles # 1.798 GHz + 3,334,038,485 instructions # 1.62 insn per cycle + 1.149410848 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index f99fe56362..f62f4c8cdf 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:57:21 +DATE: 2024-03-01_03:15:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.982111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.119833e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.119833e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.457183 sec - 1,967,411,727 cycles # 2.939 GHz - 2,909,797,751 instructions # 1.48 insn per cycle - 0.727045964 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.451281 sec + 1,977,131,537 cycles # 2.986 GHz + 2,910,150,577 instructions # 1.47 insn per cycle + 0.718929629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,169 +72,169 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.787238e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.561461e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.561461e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.635665 sec - 2,588,344,996 cycles # 2.985 GHz - 3,973,729,693 instructions # 1.54 insn per cycle - 0.925601980 seconds time elapsed +TOTAL : 0.637857 sec + 2,608,085,808 cycles # 2.999 GHz + 3,961,129,191 instructions # 1.52 insn per cycle + 0.928114705 
seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 Avg ME (F77/CUDA) = 1.4132214346515752 Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.799107e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.814416e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.814416e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 5.876793 sec - 18,189,272,845 cycles # 3.093 GHz - 55,241,417,370 instructions # 3.04 insn per cycle - 5.881598403 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.166590 sec + 19,068,958,964 cycles # 3.091 GHz + 58,962,429,433 instructions # 3.09 insn per cycle + 6.170849448 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.047168e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.206962e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.206962e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.835268 sec - 5,698,652,492 cycles # 3.098 GHz - 16,175,394,330 instructions # 2.84 insn per cycle - 1.840169365 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.898339 sec + 5,876,062,473 cycles # 3.090 GHz + 16,741,995,731 instructions # 2.85 insn per cycle + 1.902713080 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.885686e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.893142 sec - 2,601,913,974 cycles # 2.900 GHz - 6,121,802,998 instructions # 2.35 insn per cycle - 0.897968889 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +TOTAL : 0.895765 sec + 2,600,620,319 cycles # 2.891 GHz + 6,016,590,564 instructions # 2.31 insn per cycle + 0.900189489 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.113688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201012e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201012e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.799670 sec - 2,307,090,883 cycles # 2.871 GHz - 5,589,004,785 instructions # 2.42 insn per cycle - 0.804426909 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +TOTAL : 0.810420 sec + 2,363,958,510 cycles # 2.904 GHz + 5,639,045,986 instructions # 2.39 insn per cycle + 0.814799834 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.486631e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.532097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.532097e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.130259 sec - 2,034,132,380 cycles # 1.793 GHz - 3,327,384,092 instructions # 1.64 insn per cycle - 1.135408818 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +TOTAL : 1.048212 sec + 2,071,251,869 cycles # 1.970 GHz + 3,374,799,702 instructions # 1.63 insn per cycle + 1.052574627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 06f543fbee..b43a9401e8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:10:24 +DATE: 2024-03-01_02:31:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.384648e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.366733e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.479160e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443785 sec - 1,993,953,840 cycles # 3.008 GHz - 2,818,381,366 instructions # 1.41 insn per cycle - 0.736398900 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.446885 sec + 1,972,174,797 cycles # 2.962 GHz + 2,746,314,290 instructions # 1.39 insn per cycle + 0.738224654 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.037533e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.378680e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.476282e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.496956 sec - 2,155,618,409 cycles # 2.983 GHz - 3,013,698,932 instructions # 1.40 insn per cycle - 0.781903701 seconds time elapsed +TOTAL : 0.497273 sec + 2,176,246,033 cycles # 3.004 GHz + 3,133,180,341 instructions # 1.44 insn per cycle + 0.782102946 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.412608e+00 Avg ME (F77/CUDA) = 1.4132214346515752 Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.794393e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.809440e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.809440e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 5.884011 sec - 18,141,446,314 cycles # 3.081 GHz - 54,990,057,284 instructions # 3.03 insn per cycle - 5.891015557 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.143350 sec + 18,995,848,931 cycles # 3.090 GHz + 58,700,265,502 instructions # 3.09 insn per cycle + 6.150073952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.239102e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.406607e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.406607e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.793435 sec - 5,530,022,881 cycles # 3.076 GHz - 16,222,893,517 instructions # 2.93 insn per cycle - 1.806863598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.804269 sec + 5,584,642,506 cycles # 3.088 GHz + 16,510,962,038 instructions # 2.96 insn per cycle + 1.819572816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129863487235070 -Relative difference = 2.4679898241023883e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.638710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.689089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.689089e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.020116 sec - 2,973,003,499 cycles # 2.902 GHz - 6,707,847,761 instructions # 2.26 insn per cycle - 1.032449924 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) +TOTAL : 1.022630 sec + 2,975,513,176 cycles # 2.898 GHz + 6,634,498,276 instructions # 2.23 insn per cycle + 1.034400565 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.802877e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.865060e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.865060e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.929310 sec - 2,704,404,518 cycles # 2.897 GHz - 6,222,577,733 instructions # 2.30 insn per cycle - 0.944067617 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) +TOTAL : 0.945795 sec + 2,752,522,160 cycles # 2.898 GHz + 6,256,039,450 instructions # 2.27 insn per cycle + 0.961442115 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.524110e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.568915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.568915e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.097370 sec - 2,152,540,827 cycles # 1.954 GHz - 3,642,461,294 instructions # 1.69 insn per cycle - 1.111654079 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) +TOTAL : 1.200320 sec + 2,230,572,619 cycles # 1.852 GHz + 3,698,329,997 instructions # 1.66 insn per cycle + 1.213663484 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 10aee9994d..568d6c4513 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:10:49 +DATE: 2024-03-01_02:31:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.411052e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.037219e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053212e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462353 sec - 2,044,752,105 cycles # 3.002 GHz - 2,922,291,440 instructions # 1.43 insn per cycle - 0.754029831 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.463709 sec + 2,071,639,040 cycles # 3.004 GHz + 2,941,031,538 instructions # 1.42 insn per cycle + 0.764842159 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037940e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312467e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329162e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.607140 sec - 2,555,586,136 cycles # 3.009 GHz - 3,834,621,246 instructions # 1.50 insn per cycle - 0.907842423 seconds time elapsed +TOTAL : 0.608855 sec + 2,552,084,280 cycles # 3.004 GHz + 3,794,047,088 instructions # 1.49 insn per cycle + 0.909216297 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 Avg ME (F77/CUDA) = 1.4131213755569487 Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.525140e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.537527e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.537527e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.512388 sec - 19,949,166,194 cycles # 3.062 GHz - 59,159,604,988 instructions # 2.97 insn per cycle - 6.519783633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.456566 sec + 20,000,355,725 cycles # 3.096 GHz + 60,532,425,335 instructions # 3.03 insn per cycle + 6.462989015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.087383e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.135240e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.135240e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.246233 sec - 10,091,680,067 cycles # 3.109 GHz - 29,764,233,813 instructions # 2.95 insn per cycle - 3.262583152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.288178 sec + 10,191,043,016 cycles # 3.096 GHz + 30,384,591,666 instructions # 2.98 insn per cycle + 3.302408299 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.852319e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.003597e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.685614 sec - 4,874,885,089 cycles # 2.885 GHz - 11,201,070,619 instructions # 2.30 insn per cycle - 1.697607709 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) +TOTAL : 1.686926 sec + 4,874,678,301 cycles # 2.883 GHz + 10,979,160,826 instructions # 2.25 insn per cycle + 1.698730583 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.141882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.165680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.165680e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.457259 sec - 4,227,831,629 cycles # 2.893 GHz - 10,145,806,984 instructions # 2.40 insn per cycle - 1.470346665 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) +TOTAL : 1.469271 sec + 4,278,421,569 cycles # 2.904 GHz + 10,248,685,624 instructions # 2.40 insn per cycle + 1.480280367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.008468e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.125280e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.125280e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.070196 sec - 3,997,404,384 cycles # 1.927 GHz - 5,838,748,265 instructions # 1.46 insn per cycle - 2.086574272 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) +TOTAL : 2.183850 sec + 4,204,822,902 cycles # 1.923 GHz + 6,044,506,630 instructions # 1.44 insn per cycle + 2.192719745 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index e07e294a1d..2001d2a062 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-02-05_21:11:18 +DATE: 2024-03-01_02:32:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.435966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038196e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054495e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462791 sec - 2,083,546,367 cycles # 3.014 GHz - 2,972,290,724 instructions # 1.43 insn per cycle - 0.769446480 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.461655 sec + 2,079,301,655 cycles # 3.013 GHz + 2,945,288,445 instructions # 1.42 insn per cycle + 0.761228896 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.031258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302322e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318791e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605517 sec - 2,552,548,170 cycles # 3.005 GHz - 3,840,818,275 instructions # 1.50 insn per cycle - 0.908380997 seconds time elapsed +TOTAL : 0.603998 sec + 2,550,056,991 cycles # 3.016 GHz + 3,770,712,997 instructions # 1.48 insn per cycle + 0.905342631 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 Avg ME (F77/CUDA) = 1.4131213755569487 Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.591923e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.604576e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.604576e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.344174 sec - 19,690,002,782 cycles # 3.102 GHz - 58,706,037,230 instructions # 2.98 insn per cycle - 6.351179172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.482109 sec + 19,897,203,281 cycles # 3.068 GHz + 59,934,079,759 instructions # 3.01 insn per cycle + 6.488470935 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.074664e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.121928e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.121928e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.250511 sec - 10,100,144,581 cycles # 3.103 GHz - 30,158,060,846 instructions # 2.99 insn per cycle - 3.262078242 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.246582 sec + 10,068,513,741 cycles # 3.097 GHz + 30,097,905,174 instructions # 2.99 insn per cycle + 3.264343936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.144723e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.313489e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.313489e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.815494 sec - 5,030,331,211 cycles # 2.763 GHz - 11,663,521,674 instructions # 2.32 insn per cycle - 1.831265530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) +TOTAL : 1.728964 sec + 5,016,079,762 cycles # 2.895 GHz + 11,483,054,886 instructions # 2.29 insn per cycle + 1.742427809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.066889e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088085e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.557471 sec - 4,540,720,285 cycles # 2.907 GHz - 10,787,106,557 instructions # 2.38 insn per cycle - 1.573334698 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) +TOTAL : 1.580395 sec + 4,590,869,899 cycles # 2.898 GHz + 10,811,034,467 instructions # 2.35 insn per cycle + 1.596114627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.801981e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913458e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913458e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.124298 sec - 4,048,675,940 cycles # 1.902 GHz - 6,072,800,594 instructions # 1.50 insn per cycle - 2.136798569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) +TOTAL : 2.184061 sec + 4,216,157,602 cycles # 1.927 GHz + 6,273,944,868 instructions # 1.49 insn per cycle + 2.195028764 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 4386f9eaa6..c4f627d4b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:11:47 +DATE: 2024-03-01_02:32:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.496016e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.529637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.532403e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.491439e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.525349 sec - 2,294,512,437 cycles # 2.993 GHz - 3,492,349,564 instructions # 1.52 insn per cycle - 0.840403466 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.526891 sec + 2,312,216,646 cycles # 3.007 GHz + 3,538,385,257 instructions # 1.53 insn per cycle + 0.841955777 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.126320e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.166438e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.167839e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.035102 sec - 10,125,737,942 cycles # 3.073 GHz - 21,079,378,058 instructions # 2.08 insn per cycle - 3.352111070 seconds time elapsed +TOTAL : 3.037875 sec + 10,086,152,870 cycles # 3.059 GHz + 22,511,661,776 instructions # 2.23 insn per cycle + 3.352868148 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.913785e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.914720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.914720e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.579787 sec - 26,438,116,213 cycles # 3.080 GHz - 81,752,514,663 instructions # 3.09 insn per cycle - 8.587422083 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.365178 sec + 25,629,682,297 cycles # 3.063 GHz + 78,935,463,104 instructions # 3.08 insn per cycle + 8.371779038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.829757e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.833256e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.833256e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.292364 sec - 12,887,054,299 cycles # 2.999 GHz - 39,241,639,513 instructions # 3.05 insn per cycle - 4.302239285 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.352554 sec + 12,920,825,541 cycles # 2.966 GHz + 39,280,019,197 instructions # 3.04 insn per cycle + 4.370436126 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.577458e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.594978e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.594978e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.923283 sec - 5,565,898,996 cycles # 2.889 GHz - 13,789,682,242 instructions # 2.48 insn per cycle - 1.937602465 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.920439 sec + 5,577,220,412 cycles # 2.899 GHz + 13,686,699,383 instructions # 2.45 insn per cycle + 1.933532640 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.748302e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.771041e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.771041e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.692940 sec - 4,898,171,848 cycles # 2.887 GHz - 12,318,410,362 instructions # 2.51 insn per cycle - 1.708522131 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +TOTAL : 1.708010 sec + 4,898,677,790 cycles # 2.863 GHz + 12,341,670,637 instructions # 2.52 insn per cycle + 1.722166284 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.787552e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.802476e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.802476e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.115967 sec - 4,056,542,434 cycles # 1.913 GHz - 6,286,251,479 instructions # 1.55 insn per cycle - 2.129797525 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +TOTAL : 2.187284 sec + 4,109,191,778 cycles # 1.875 GHz + 6,335,550,253 instructions # 1.54 insn per cycle + 2.200752564 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index eb520cfa63..8d1778e673 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:58:20 +DATE: 2024-03-01_03:16:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.145693e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489458e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.489458e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.517339 sec - 2,180,189,349 cycles # 2.926 GHz - 3,535,379,038 instructions # 1.62 insn per cycle - 0.806007511 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.511155 sec + 2,228,194,908 cycles # 3.016 GHz + 3,541,287,827 instructions # 1.59 insn per cycle + 0.799045956 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,169 +72,169 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.625721e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.093314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.093314e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.299968 sec - 11,007,925,791 cycles # 3.081 GHz - 23,447,985,186 instructions # 2.13 insn per cycle - 3.629584738 seconds time elapsed +TOTAL : 3.305480 sec + 10,998,775,521 cycles # 3.077 GHz + 24,493,841,360 instructions # 2.23 insn per cycle + 
3.633710964 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.909621e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.910509e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.910509e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.601768 sec - 26,468,647,489 cycles # 3.079 GHz - 81,762,792,143 instructions # 3.09 insn per cycle - 8.606858859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.395628 sec + 25,661,453,890 cycles # 3.059 GHz + 78,946,626,848 instructions # 3.08 insn per cycle + 8.400144517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.825773e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.829632e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.829632e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.300735 sec - 12,906,768,826 cycles # 2.998 GHz - 39,253,927,939 instructions # 3.04 insn per cycle - 4.305912555 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.352704 sec + 12,939,532,043 cycles # 2.970 GHz + 39,292,271,047 instructions # 3.04 insn per cycle + 4.357352756 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.611352e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.629243e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.629243e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.918353 sec - 5,573,969,343 cycles # 2.899 GHz - 13,798,757,056 instructions # 2.48 insn per cycle - 1.923671163 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.929060 sec + 5,589,750,479 cycles # 2.892 GHz + 13,696,577,373 instructions # 2.45 insn per cycle + 1.933630865 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.728272e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.752598e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.752598e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.699696 sec - 4,912,735,798 cycles # 2.884 GHz - 12,327,911,686 instructions # 2.51 insn per cycle - 1.704745554 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +TOTAL : 1.695619 sec + 4,910,055,408 cycles # 2.889 GHz + 12,351,492,799 instructions # 2.52 insn per cycle + 1.700097015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.798492e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.814095e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.814095e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.117095 sec - 4,066,575,770 cycles # 1.917 GHz - 6,296,588,952 instructions # 1.55 insn per cycle - 2.122327452 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +TOTAL : 2.165843 sec + 4,123,850,554 cycles # 1.901 GHz + 6,345,407,560 instructions # 1.54 insn per cycle + 2.170297070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 7d82001653..597fd5665a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:10:04 +DATE: 2024-03-01_03:28:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.512961e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.540583e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.542758e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.505487 sec - 2,244,702,706 cycles # 3.012 GHz - 3,497,665,121 instructions # 1.56 insn per cycle - 0.813818087 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +TOTAL : 0.505991 sec + 2,242,092,583 cycles # 3.014 GHz + 3,466,791,908 instructions # 1.55 insn per cycle + 0.811853126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.142807e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178528e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.116341 sec - 10,381,161,571 cycles # 3.080 GHz - 21,983,616,246 instructions # 2.12 insn per cycle - 3.426998706 seconds time elapsed +TOTAL : 3.124130 sec + 10,356,034,147 cycles # 3.069 GHz + 23,417,816,833 instructions # 2.26 insn per cycle + 3.433693053 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.926994e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927950e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927950e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.522111 sec - 26,416,984,323 cycles # 3.099 GHz - 81,751,260,197 instructions # 3.09 insn per cycle - 8.526902879 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.389537 sec + 25,646,805,438 cycles # 3.056 GHz + 78,935,262,340 instructions # 3.08 insn per cycle + 8.393631651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.824894e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.828545e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.828545e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.302603 sec - 12,900,803,176 cycles # 2.997 GHz - 39,242,002,424 instructions # 3.04 insn per cycle - 4.307598417 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.369422 sec + 12,916,153,129 cycles # 2.954 GHz + 39,278,867,860 instructions # 3.04 insn per cycle + 4.373667823 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.654357e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.672586e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.672586e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.906130 sec - 5,555,557,944 cycles # 2.911 GHz - 13,787,634,047 instructions # 2.48 insn per cycle - 1.910883796 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.933878 sec + 5,580,678,683 cycles # 2.881 GHz + 13,684,529,284 instructions # 2.45 insn per cycle + 1.937965494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.550673e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.573296e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.573296e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.737685 sec - 4,902,417,602 cycles # 2.830 GHz - 12,315,886,492 instructions # 2.51 insn per cycle - 1.742410859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +TOTAL : 1.697628 sec + 4,903,453,092 cycles # 2.882 GHz + 12,338,806,795 instructions # 2.52 insn per cycle + 1.701856837 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.807297e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.822744e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.822744e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.112283 sec - 4,060,879,337 cycles # 1.919 GHz - 6,283,466,586 instructions # 1.55 insn per cycle - 2.117099643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +TOTAL : 2.253262 sec + 4,111,107,725 cycles # 1.822 GHz + 6,332,329,650 instructions # 1.54 insn per cycle + 2.257544828 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 6bafd9bab6..60e01cd2dd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:06:43 +DATE: 2024-03-01_03:24:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.480734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.510074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512231e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.510827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541615e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.506367 sec - 2,239,917,145 cycles # 3.011 GHz - 3,475,727,316 instructions # 1.55 insn per cycle - 0.813688193 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +TOTAL : 0.503655 sec + 2,239,000,994 cycles # 3.024 GHz + 3,553,306,239 instructions # 1.59 insn per cycle + 0.813367897 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.146935e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181383e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182805e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180837e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.063188 sec - 10,208,493,450 cycles # 3.077 GHz - 23,347,053,488 instructions # 2.29 insn per cycle - 3.373683802 seconds time elapsed +TOTAL : 3.061657 sec + 10,188,245,124 cycles # 3.074 GHz + 23,248,414,020 instructions # 2.28 insn per cycle + 3.370951944 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927773e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.928672e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.928672e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.938252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.939166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.939166e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.518263 sec - 26,413,942,812 cycles # 3.100 GHz - 81,752,027,211 instructions # 3.10 insn per cycle - 8.523020083 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.471636 sec + 25,650,928,170 cycles # 3.027 GHz + 78,935,761,644 instructions # 3.08 insn per cycle + 8.475777896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.769339e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.773163e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.773163e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.732481e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.735838e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.735838e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.362035 sec - 12,908,972,832 cycles # 2.957 GHz - 39,242,218,426 instructions # 3.04 insn per cycle - 4.367705377 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.403548 sec + 12,924,361,173 cycles # 2.933 GHz + 39,279,334,894 instructions # 3.04 insn per cycle + 4.407811208 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.597196e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.615390e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.615390e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.485088e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.502714e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.502714e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.917124 sec - 5,555,350,612 cycles # 2.892 GHz - 13,788,371,660 instructions # 2.48 insn per cycle - 1.921925818 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.942219 sec + 5,571,920,631 cycles # 2.864 GHz + 13,685,480,241 instructions # 2.46 insn per cycle + 1.946449782 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.768207e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.792227e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.792227e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.737761e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.761950e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.761950e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.688918 sec - 4,898,535,094 cycles # 2.894 GHz - 12,317,895,390 instructions # 2.51 insn per cycle - 1.693783698 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +TOTAL : 1.693500 sec + 4,894,918,115 cycles # 2.884 GHz + 12,340,665,409 instructions # 2.52 insn per cycle + 1.697702233 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.389097e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.403636e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.403636e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.532631e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.547658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.547658e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.229769 sec - 4,080,389,432 cycles # 1.828 GHz - 6,285,644,175 instructions # 1.54 insn per cycle - 2.234633734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +TOTAL : 2.186925 sec + 4,105,530,431 cycles # 1.874 GHz + 6,333,977,995 instructions # 1.54 insn per cycle + 2.191453097 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index ca740aa697..de32359ede 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,214 +13,214 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:03:28 +DATE: 2024-03-01_03:21:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.220163e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.520037e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.522371e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.508013 sec - 2,245,889,851 cycles # 3.028 GHz - 3,557,197,805 instructions # 1.58 insn per cycle - 0.802735999 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +TOTAL : 0.508517 sec + 2,246,531,629 cycles # 3.011 GHz + 3,559,465,442 instructions # 1.58 insn per cycle + 0.806328345 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.172289e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.173702e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.189427 sec - 10,589,984,166 cycles # 3.075 GHz - 23,506,097,983 instructions # 2.22 insn per cycle - 3.503661605 seconds time elapsed +TOTAL : 3.195111 sec + 10,565,694,760 cycles # 3.061 GHz + 24,272,327,456 instructions # 2.30 insn per cycle + 3.508790742 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.908714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909599e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909599e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.601904 sec - 26,416,000,686 cycles # 3.070 GHz - 81,751,768,928 instructions # 3.09 insn per cycle - 8.606661074 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.415718 sec + 25,630,796,247 cycles # 3.044 GHz + 78,935,144,677 instructions # 3.08 insn per cycle + 8.419920398 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.847274e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.850852e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850852e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.273544 sec - 12,895,840,707 cycles # 3.016 GHz - 39,241,783,042 instructions # 3.04 insn per cycle - 4.278279611 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.383944 sec + 12,941,364,841 cycles # 2.950 GHz + 39,279,009,350 instructions # 3.04 insn per cycle + 4.388336169 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.616490e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.634593e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.634593e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.912479 sec - 5,552,432,995 cycles # 2.897 GHz - 13,787,867,550 instructions # 2.48 insn per cycle - 1.917408982 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.951803 sec + 5,576,482,664 cycles # 2.852 GHz + 13,685,505,947 instructions # 2.45 insn per cycle + 1.956019187 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.750174e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.773348e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.773348e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.691764 sec - 4,896,428,138 cycles # 2.888 GHz - 12,317,937,826 instructions # 2.52 insn per cycle - 1.696849640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +TOTAL : 1.690955 sec + 4,892,330,509 cycles # 2.888 GHz + 12,340,572,549 instructions # 2.52 insn per cycle + 1.695111197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.676695e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.690901e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.690901e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.146460 sec - 4,051,401,304 cycles # 1.885 GHz - 6,286,186,428 instructions # 1.55 insn per cycle - 2.151328455 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +TOTAL : 2.155567 sec + 4,105,793,778 cycles # 1.902 GHz + 6,333,858,387 instructions # 1.54 insn per cycle + 2.159935327 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index c1b031e169..836b2fd223 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:12:24 +DATE: 2024-03-01_02:33:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.477440e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.511011e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.513409e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.492178e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523123 sec - 2,259,536,719 cycles # 2.994 GHz - 3,495,615,515 instructions # 1.55 insn per cycle - 0.827170133 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.523446 sec + 2,259,779,898 cycles # 2.994 GHz + 3,514,783,609 instructions # 1.56 insn per cycle + 0.830655921 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.144882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.182235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.183656e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.022410 sec - 10,059,246,665 cycles # 3.067 GHz - 22,982,457,538 instructions # 2.28 insn per cycle - 3.336684992 seconds time elapsed +TOTAL : 3.027147 sec + 10,102,095,677 cycles # 3.066 GHz + 22,774,733,235 instructions # 2.25 insn per cycle + 3.352533111 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158133E-004 Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.895668e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896543e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896543e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.662390 sec - 26,452,377,540 cycles # 3.053 GHz - 81,778,558,563 instructions # 3.09 insn per cycle - 8.669649159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.342362 sec + 25,562,894,530 cycles # 3.064 GHz + 78,707,498,900 instructions # 3.08 insn per cycle + 8.350709191 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.794479e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.797927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797927e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.332491 sec - 12,903,575,926 cycles # 2.976 GHz - 39,248,558,231 instructions # 3.04 insn per cycle - 4.346983128 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.374701 sec + 12,919,245,066 cycles # 2.951 GHz + 39,226,355,054 instructions # 3.04 insn per cycle + 4.387657418 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.616252e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.633907e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.633907e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.912847 sec - 5,552,920,900 cycles # 2.896 GHz - 13,804,312,506 instructions # 2.49 insn per cycle - 1.924450522 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) +TOTAL : 1.987975 sec + 5,629,143,308 cycles # 2.825 GHz + 13,800,788,871 instructions # 2.45 insn per cycle + 1.999251955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.848997e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.873421e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.873421e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.675121 sec - 4,877,663,558 cycles # 2.905 GHz - 12,329,320,941 instructions # 2.53 insn per cycle - 1.686814675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) +TOTAL : 1.716692 sec + 4,942,228,477 cycles # 2.873 GHz + 12,466,581,724 instructions # 2.52 insn per cycle + 1.728222884 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.700547e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.714897e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.714897e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.139777 sec - 4,102,041,327 cycles # 1.913 GHz - 6,293,429,861 instructions # 1.53 insn per cycle - 2.153998058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) +TOTAL : 2.159145 sec + 4,117,977,410 cycles # 1.904 GHz + 6,458,802,297 instructions # 1.57 insn per cycle + 2.172057894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 076951b3cb..5cb26f1dc5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:47:18 +DATE: 2024-03-01_03:05:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.222522e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.247216e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249235e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264818e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529631 sec - 2,304,451,425 cycles # 3.007 GHz - 3,606,899,880 instructions # 1.57 insn per cycle - 0.823545603 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.529504 sec + 2,311,611,520 cycles # 3.006 GHz + 3,548,053,349 instructions # 1.53 insn per cycle + 0.826491750 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.770931e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.799512e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.800677e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.294556 sec - 10,953,172,236 cycles # 3.084 GHz - 24,077,449,681 instructions # 2.20 insn per cycle - 3.611746200 seconds time elapsed +TOTAL : 3.298192 sec + 10,832,117,508 cycles # 3.051 GHz + 23,123,371,744 instructions # 2.13 insn per cycle + 3.609870208 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.479098e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.479597e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.479597e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.629078 sec - 112,948,926,771 cycles # 3.084 GHz - 141,510,082,785 instructions # 1.25 insn per cycle - 36.633991445 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 36.966049 sec + 113,615,073,618 cycles # 3.074 GHz + 144,968,095,911 instructions # 1.28 insn per cycle + 36.970400514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.310376e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.313138e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.313138e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.964302 sec - 14,929,585,775 cycles # 3.005 GHz - 37,533,096,829 instructions # 2.51 insn per cycle - 4.969441025 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.007790 sec + 14,730,075,423 cycles # 2.939 GHz + 37,574,123,368 instructions # 2.55 insn per cycle + 5.012256986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.873423e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.888108e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.888108e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.092961 sec - 6,035,782,399 cycles # 2.879 GHz - 12,947,327,941 instructions # 2.15 insn per cycle - 2.097818635 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) +TOTAL : 2.127650 sec + 6,163,100,705 cycles # 2.892 GHz + 13,061,449,928 instructions # 2.12 insn per cycle + 2.132187716 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.606441e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.629322e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.629322e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.717293 sec - 4,994,518,363 cycles # 2.902 GHz - 11,363,079,717 instructions # 2.28 insn per cycle - 1.722220234 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) +TOTAL : 1.743142 sec + 5,059,957,423 cycles # 2.897 GHz + 11,440,000,239 instructions # 2.26 insn per cycle + 1.747501406 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.116630e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.132912e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.132912e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.031067 sec - 3,895,544,556 cycles # 1.915 GHz - 5,853,606,324 instructions # 1.50 insn per cycle - 2.035959095 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) +TOTAL : 2.075865 sec + 3,979,244,183 cycles # 1.914 GHz + 5,942,139,795 instructions # 1.49 insn per cycle + 2.080305520 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 24a5052bbf..afca4b7953 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:48:26 +DATE: 2024-03-01_03:06:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.253640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.278160e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.280144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.275983e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527924 sec - 2,278,775,837 cycles # 3.017 GHz - 3,521,445,566 instructions # 1.55 insn per cycle - 0.812423009 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.531287 sec + 2,311,991,159 cycles # 3.015 GHz + 3,584,221,599 instructions # 1.55 insn per cycle + 0.825938734 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.792881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822944e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.270966 sec - 10,872,573,045 cycles # 3.074 GHz - 24,270,572,338 instructions # 2.23 insn per cycle - 3.593854665 seconds time elapsed +TOTAL : 3.269849 sec + 10,805,743,512 cycles # 3.068 GHz + 25,084,175,459 instructions # 2.32 insn per cycle + 3.579404730 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266731198158122E-004 Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.439063e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.439545e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.439545e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.958546 sec - 113,902,158,027 cycles # 3.082 GHz - 141,695,308,963 instructions # 1.24 insn per cycle - 36.963459772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 37.253529 sec + 114,121,742,420 cycles # 3.069 GHz + 145,689,073,244 instructions # 1.28 insn per cycle + 37.257693750 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.314783e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.317486e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.317486e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.957427 sec - 14,881,560,951 cycles # 3.000 GHz - 37,592,648,542 instructions # 2.53 insn per cycle - 4.962388959 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.136766 sec + 15,152,451,249 cycles # 2.948 GHz + 37,761,291,325 instructions # 2.49 insn per cycle + 5.141156615 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.059572e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.075102e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.075102e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.044832 sec - 5,940,592,925 cycles # 2.900 GHz - 12,831,574,033 instructions # 2.16 insn per cycle - 2.049914401 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) +TOTAL : 2.072422 sec + 6,013,210,013 cycles # 2.896 GHz + 12,895,807,400 instructions # 2.14 insn per cycle + 2.076740513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.581576e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.604096e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.604096e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.721363 sec - 4,986,832,806 cycles # 2.890 GHz - 11,359,238,555 instructions # 2.28 insn per cycle - 1.726371578 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) +TOTAL : 1.755119 sec + 5,091,337,522 cycles # 2.895 GHz + 11,446,622,503 instructions # 2.25 insn per cycle + 1.759562583 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.082101e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.098338e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.098338e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.039340 sec - 3,890,542,782 cycles # 1.904 GHz - 5,843,067,877 instructions # 1.50 insn per cycle - 2.044270183 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) +TOTAL : 2.059473 sec + 3,944,538,203 cycles # 1.912 GHz + 5,896,184,476 instructions # 1.49 insn per cycle + 2.063940696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 2a26f6c49e..082176c355 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:13:01 +DATE: 2024-03-01_02:33:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.334488e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.395305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401279e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401451e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479871 sec - 2,082,107,480 cycles # 2.987 GHz - 3,068,311,706 instructions # 1.47 insn per cycle - 0.776340123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.481440 sec + 2,077,514,231 cycles # 2.979 GHz + 3,093,505,744 instructions # 1.49 insn per cycle + 0.777796663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.553737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.638533e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.641807e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.622317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.723684 sec - 5,920,743,472 cycles # 3.029 GHz - 12,600,426,909 instructions # 2.13 insn per cycle - 2.014393536 seconds time elapsed +TOTAL : 1.713365 sec + 5,944,272,538 cycles # 3.053 GHz + 12,632,277,461 instructions # 2.13 insn per cycle + 2.004079656 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.101564e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.102651e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.102651e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.813063 sec - 24,199,832,283 cycles # 3.096 GHz - 75,878,496,374 instructions # 3.14 insn per cycle - 7.820369454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.010109 sec + 24,614,432,061 cycles # 3.072 GHz + 78,126,558,251 instructions # 3.17 insn per cycle + 8.016891762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.355480e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.368368e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.368368e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.239129 sec - 6,484,377,988 cycles # 2.892 GHz - 20,115,449,226 instructions # 3.10 insn per cycle - 2.250294450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.228676 sec + 6,461,822,382 cycles # 2.894 GHz + 20,120,855,558 instructions # 3.11 insn per cycle + 2.241648353 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.698731e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.705466e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.705466e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.974985 sec - 2,819,368,645 cycles # 2.882 GHz - 7,038,056,878 instructions # 2.50 insn per cycle - 0.989199229 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +TOTAL : 0.990019 sec + 2,821,251,649 cycles # 2.839 GHz + 6,989,221,748 instructions # 2.48 insn per cycle + 1.002444816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942632e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.951445e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.951445e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.852688 sec - 2,476,830,184 cycles # 2.890 GHz - 6,280,101,874 instructions # 2.54 insn per cycle - 0.869178735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +TOTAL : 0.861179 sec + 2,488,986,957 cycles # 2.876 GHz + 6,296,476,670 instructions # 2.53 insn per cycle + 0.887481911 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.560069e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565955e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565955e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.059815 sec - 2,036,227,562 cycles # 1.913 GHz - 3,248,407,876 instructions # 1.60 insn per cycle - 1.071376552 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +TOTAL : 1.078476 sec + 2,048,809,794 cycles # 1.894 GHz + 3,266,667,713 instructions # 1.59 insn per cycle + 1.091634951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index bbf3fbe6ee..6f564b583c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:58:57 +DATE: 2024-03-01_03:17:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.679886e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.329455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.329455e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467337 sec - 2,052,749,916 cycles # 2.990 GHz - 3,071,904,430 instructions # 1.50 insn per cycle - 0.745966161 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.468201 sec + 2,060,292,715 cycles # 2.983 GHz + 3,094,906,819 instructions # 1.50 insn per cycle + 0.750075013 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,169 +72,169 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.238688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.463206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.463206e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.898849 sec - 6,460,514,384 cycles # 3.024 GHz - 13,711,212,771 instructions # 2.12 insn per cycle - 2.194001425 seconds time elapsed +TOTAL : 1.882218 sec + 6,478,461,444 cycles # 3.059 GHz + 12,879,929,349 instructions # 1.99 insn per cycle + 
2.174649918 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.088417e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.089483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.089483e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.863905 sec - 24,222,682,650 cycles # 3.080 GHz - 75,881,880,568 instructions # 3.13 insn per cycle - 7.868677803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.044775 sec + 24,623,818,516 cycles # 3.060 GHz + 78,132,484,739 instructions # 3.17 insn per cycle + 8.049291657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.610137e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.625294e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.625294e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.165952 sec - 6,500,582,172 cycles # 2.998 GHz - 20,124,386,165 instructions # 3.10 insn per cycle - 2.170723762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.197009 sec + 6,464,288,620 cycles # 2.938 GHz + 20,129,426,624 instructions # 3.11 insn per cycle + 2.201352169 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.710509e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.717516e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.717516e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.969667 sec - 2,825,954,460 cycles # 2.903 GHz - 7,046,906,721 instructions # 2.49 insn per cycle - 0.974421373 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +TOTAL : 0.973161 sec + 2,827,392,405 cycles # 2.894 GHz + 6,998,075,079 instructions # 2.48 insn per cycle + 0.977561277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.939035e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.948865e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.948865e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.856524 sec - 2,486,142,670 cycles # 2.890 GHz - 6,288,929,352 instructions # 2.53 insn per cycle - 0.861361903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +TOTAL : 0.859317 sec + 2,491,742,914 cycles # 2.887 GHz + 6,305,390,293 instructions # 2.53 insn per cycle + 0.863665296 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.549562e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.555377e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.555377e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.069429 sec - 2,044,889,978 cycles # 1.905 GHz - 3,257,809,276 instructions # 1.59 insn per cycle - 1.074199449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +TOTAL : 1.067932 sec + 2,057,227,059 cycles # 1.920 GHz + 3,276,345,738 instructions # 1.59 insn per cycle + 1.072312021 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 881037651a..66226e8d59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:10:40 +DATE: 2024-03-01_03:28:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.337925e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.395257e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.363626e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.462308 sec - 2,034,668,664 cycles # 3.023 GHz - 3,018,996,605 instructions # 1.48 insn per cycle - 0.730931017 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +TOTAL : 0.461299 sec + 2,006,885,691 cycles # 2.992 GHz + 3,022,532,155 instructions # 1.51 insn per cycle + 0.728549346 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.577816e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.651460e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.654751e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.794466 sec - 6,241,889,186 cycles # 3.071 GHz - 13,099,547,835 instructions # 2.10 insn per cycle - 2.089963432 seconds time elapsed +TOTAL : 1.795584 sec + 6,148,728,410 cycles # 3.042 GHz + 12,326,233,623 instructions # 2.00 insn per cycle + 2.078967785 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.114831e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.115954e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.115954e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.764258 sec - 24,215,823,633 cycles # 3.118 GHz - 75,876,946,025 instructions # 3.13 insn per cycle - 7.768790583 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 7.994149 sec + 24,620,138,866 cycles # 3.079 GHz + 78,125,377,108 instructions # 3.17 insn per cycle + 7.998228624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.487445e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.500995e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.500995e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 2.199553 sec - 6,495,792,958 cycles # 2.953 GHz - 20,114,445,952 instructions # 3.10 insn per cycle - 2.204072119 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 2.242069 sec + 6,461,640,731 cycles # 2.878 GHz + 20,121,052,869 instructions # 3.11 insn per cycle + 2.246196034 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.699836e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.707208e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.707208e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 -TOTAL : 0.974730 sec - 2,823,184,438 cycles # 2.886 GHz - 7,036,774,400 instructions # 2.49 insn per cycle - 0.979278056 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.982986 sec + 2,822,415,829 cycles # 2.862 GHz + 6,987,486,660 instructions # 2.48 insn per cycle + 0.987025186 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.944843e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953999e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953999e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 -TOTAL : 0.852312 sec - 2,479,942,344 cycles # 2.897 GHz - 6,275,567,381 instructions # 2.53 insn per cycle - 0.856933859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.855808 sec + 2,484,894,865 cycles # 2.892 GHz + 6,291,816,709 instructions # 2.53 insn per cycle + 0.859867773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.563188e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.569271e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.569271e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.058689 sec - 2,038,090,504 cycles # 1.919 GHz - 3,244,115,296 instructions # 1.59 insn per cycle - 1.063236251 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +TOTAL : 1.069890 sec + 2,051,026,977 cycles # 1.912 GHz + 3,263,937,559 instructions # 1.59 insn per cycle + 1.073863100 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 6fdba85723..e810053300 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:07:20 +DATE: 2024-03-01_03:25:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.355863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.408384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.413572e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.337764e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.393743e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.459733 sec - 2,017,638,225 cycles # 3.004 GHz - 3,053,099,345 instructions # 1.51 insn per cycle - 0.729037153 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +TOTAL : 0.460965 sec + 2,014,485,763 cycles # 3.003 GHz + 3,009,625,577 instructions # 1.49 insn per cycle + 0.728425666 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.567086e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.640820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.644053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.558734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.632343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635567e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.746231 sec - 6,038,476,497 cycles # 3.062 GHz - 12,908,245,229 instructions # 2.14 insn per cycle - 2.029486292 seconds time elapsed +TOTAL : 1.743753 sec + 6,041,672,737 cycles # 3.067 GHz + 12,221,124,809 instructions # 2.02 insn per cycle + 2.027112098 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086272e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.087317e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.087317e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.040104e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041097e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041097e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.869325 sec - 24,205,277,736 cycles # 3.075 GHz - 75,876,375,845 instructions # 3.13 insn per cycle - 7.874126058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.046597 sec + 24,613,022,395 cycles # 3.060 GHz + 78,130,326,722 instructions # 3.17 insn per cycle + 8.050808561 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.604059e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.618615e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.618615e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.468090e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.482424e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.482424e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.165288 sec - 6,496,211,625 cycles # 2.995 GHz - 20,114,277,451 instructions # 3.10 insn per cycle - 2.170165986 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.203809 sec + 6,456,229,713 cycles # 2.925 GHz + 20,119,923,968 instructions # 3.12 insn per cycle + 2.207913022 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.714168e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.721433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.721433e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.712278e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.719631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.719631e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.964960 sec - 2,815,359,642 cycles # 2.907 GHz - 7,036,914,271 instructions # 2.50 insn per cycle - 0.969520380 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +TOTAL : 0.965705 sec + 2,817,996,939 cycles # 2.908 GHz + 6,988,025,639 instructions # 2.48 insn per cycle + 0.969794950 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.961047e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.961047e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.924856e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934354e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934354e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.848346 sec - 2,475,916,860 cycles # 2.906 GHz - 6,279,126,359 instructions # 2.54 insn per cycle - 0.852826821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +TOTAL : 0.859873 sec + 2,483,822,785 cycles # 2.877 GHz + 6,295,526,273 instructions # 2.53 insn per cycle + 0.863979329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.553949e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.559900e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.559900e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558368e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.063833 sec - 2,035,247,459 cycles # 1.907 GHz - 3,247,417,227 instructions # 1.60 insn per cycle - 1.068423622 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +TOTAL : 1.064425 sec + 2,047,040,960 cycles # 1.917 GHz + 3,265,583,381 instructions # 1.60 insn per cycle + 1.068371519 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 5ba28b310e..29def3747b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,214 +13,214 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_22:04:05 +DATE: 2024-03-01_03:22:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.740544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.404954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.410693e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.466537 sec - 1,999,385,948 cycles # 2.951 GHz - 3,067,244,681 instructions # 1.53 insn per cycle - 0.734843083 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +TOTAL : 0.463988 sec + 2,009,660,419 cycles # 2.987 GHz + 3,043,780,102 instructions # 1.51 insn per cycle + 0.732052318 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.501615e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.641152e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.644324e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.819126 sec - 6,263,393,299 cycles # 3.061 GHz - 13,206,274,346 instructions # 2.11 insn per cycle - 2.110646534 seconds time elapsed +TOTAL : 1.829361 sec + 6,179,090,687 cycles # 3.005 GHz + 13,497,023,724 instructions # 2.18 insn per cycle + 2.119489112 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.077876e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078899e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078899e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.901308 sec - 24,218,143,686 cycles # 3.064 GHz - 75,878,622,530 instructions # 3.13 insn per cycle - 7.905943320 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.072340 sec + 24,646,233,583 cycles # 3.055 GHz + 78,130,465,005 instructions # 3.17 insn per cycle + 8.076398723 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.608896e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.623255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.623255e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.163733 sec - 6,487,922,446 cycles # 2.994 GHz - 20,114,268,161 instructions # 3.10 insn per cycle - 2.168256412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.213064 sec + 6,463,144,308 cycles # 2.916 GHz + 20,121,040,605 instructions # 3.11 insn per cycle + 2.217197026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.707369e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.714719e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.714719e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.969111 sec - 2,835,385,661 cycles # 2.915 GHz - 7,037,071,150 instructions # 2.48 insn per cycle - 0.973627782 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +TOTAL : 0.977816 sec + 2,816,932,981 cycles # 2.871 GHz + 6,987,870,279 instructions # 2.48 insn per cycle + 0.981891147 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.796935e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.805643e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.805643e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.921646 sec - 2,479,592,241 cycles # 2.682 GHz - 6,279,594,335 instructions # 2.53 insn per cycle - 0.926599340 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +TOTAL : 0.859893 sec + 2,483,713,955 cycles # 2.877 GHz + 6,295,351,555 instructions # 2.53 insn per cycle + 0.863911879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.571973e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.577972e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.577972e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.051221 sec - 2,033,761,132 cycles # 1.928 GHz - 3,247,291,972 instructions # 1.60 insn per cycle - 1.055684386 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +TOTAL : 1.064299 sec + 2,046,605,748 cycles # 1.917 GHz + 3,265,707,472 instructions # 1.60 insn per cycle + 1.068273671 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 38a19c6467..50b444080d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:13:31 +DATE: 2024-03-01_02:34:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.299974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.358481e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.364460e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.380502e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.480230 sec - 2,110,623,004 cycles # 3.014 GHz - 3,051,762,313 instructions # 1.45 insn per cycle - 0.785636939 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.482038 sec + 2,083,496,491 cycles # 2.987 GHz + 3,090,021,729 instructions # 1.48 insn per cycle + 0.780369869 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.553894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.626828e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.630131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.505248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.720072 sec - 5,949,004,472 cycles # 3.045 GHz - 11,591,413,648 instructions # 1.95 insn per cycle - 2.011142873 seconds time elapsed +TOTAL : 1.719742 sec + 5,952,430,615 cycles # 3.047 GHz + 11,750,571,480 instructions # 1.97 insn per cycle + 2.009992190 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262659968156085E-004 Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.101524e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.102627e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.102627e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.813552 sec - 24,191,703,837 cycles # 3.095 GHz - 75,800,583,581 instructions # 3.13 insn per cycle - 7.820528848 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.050624 sec + 24,577,706,132 cycles # 3.054 GHz + 77,857,469,800 instructions # 3.17 insn per cycle + 8.057072902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870430095556E-004 -Relative difference = 6.489572191632735e-09 +Avg ME (F77/C++) = 6.6274866268634797E-004 +Relative difference = 5.630135835748959e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.533729e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.548052e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.548052e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.185486 sec - 6,493,193,059 cycles # 2.965 GHz - 20,110,924,734 instructions # 3.10 insn per cycle - 2.248042621 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.274363 sec + 6,415,212,085 cycles # 2.816 GHz + 20,086,390,532 instructions # 3.13 insn per cycle + 2.288238797 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861465384638E-004 +Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.715181e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.722216e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.722216e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.964967 sec - 2,812,074,979 cycles # 2.903 GHz - 7,037,571,648 instructions # 2.50 insn per cycle - 0.980644370 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) +TOTAL : 1.010969 sec + 2,918,129,602 cycles # 2.878 GHz + 7,130,827,098 instructions # 2.44 insn per cycle + 1.024648825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.945182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.954199e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.954199e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.851822 sec - 2,474,162,029 cycles # 2.891 GHz - 6,280,078,187 instructions # 2.54 insn per cycle - 0.864965859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) +TOTAL : 0.895519 sec + 2,583,274,132 cycles # 2.873 GHz + 6,439,451,842 instructions # 2.49 insn per cycle + 0.910176239 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.559682e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565547e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565547e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.059915 sec - 2,038,653,616 cycles # 1.915 GHz - 3,247,299,383 instructions # 1.59 insn per cycle - 1.075802468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) +TOTAL : 1.109477 sec + 2,120,739,457 cycles # 1.905 GHz + 3,428,489,642 instructions # 1.62 insn per cycle + 1.120804955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 +Avg ME (F77/C++) = 6.6271952032322112E-004 +Relative difference = 3.066639970473621e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 5ecd50d8da..3e610d68fd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:49:34 +DATE: 2024-03-01_03:07:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.571051e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.616606e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.620923e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.599390e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.488255 sec - 2,117,936,620 cycles # 2.989 GHz - 3,170,007,418 instructions # 1.50 insn per cycle - 0.770960722 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.487762 sec + 2,117,397,644 cycles # 2.979 GHz + 3,170,491,357 instructions # 1.50 insn per cycle + 0.771619877 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.724432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.783959e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.786528e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.728616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.852214 sec - 6,378,559,976 cycles # 3.053 GHz - 13,824,910,875 instructions # 2.17 insn per cycle - 2.145918086 seconds time elapsed +TOTAL : 1.852993 sec + 6,403,206,858 cycles # 3.066 GHz + 13,984,822,985 instructions # 2.18 insn per cycle + 2.145838793 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262660579844562E-004 Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.881511e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.882353e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.882353e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 27.892796 sec - 85,879,383,305 cycles # 3.079 GHz - 133,985,180,489 instructions # 1.56 insn per cycle - 27.897590626 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.541681 sec + 87,683,123,741 cycles # 3.072 GHz + 135,626,627,328 instructions # 1.55 insn per cycle + 28.545959109 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275354356437610E-004 -Relative difference = 6.573239683366044e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627534e-04 +Avg ME (F77/C++) = 6.6275340277317796E-004 +Relative difference = 4.184328521943034e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.083048e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.096099e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.096099e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.328040 sec - 6,719,869,961 cycles # 2.885 GHz - 19,163,430,307 instructions # 2.85 insn per cycle - 2.333347546 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.302428 sec + 6,776,067,855 cycles # 2.939 GHz + 19,386,467,667 instructions # 2.86 insn per cycle + 2.306810458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859783433532E-004 -Relative difference = 3.2677016209485094e-09 +Avg ME (F77/C++) = 6.6274862707273868E-004 +Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.523354e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.528989e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.528989e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.085366 sec - 3,139,497,739 cycles # 2.883 GHz - 6,746,894,389 instructions # 2.15 insn per cycle - 1.090164339 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) +TOTAL : 1.096393 sec + 3,175,310,502 cycles # 2.890 GHz + 6,807,675,147 instructions # 2.14 insn per cycle + 1.100557110 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.851486e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.859999e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.859999e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.894209 sec - 2,606,414,605 cycles # 2.903 GHz - 5,930,989,921 instructions # 2.28 insn per cycle - 0.899025453 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) +TOTAL : 0.911313 sec + 2,641,911,907 cycles # 2.888 GHz + 5,985,989,672 instructions # 2.27 insn per cycle + 0.915610697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.546446e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.552539e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.552539e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.069126 sec - 2,048,063,476 cycles # 1.909 GHz - 3,435,686,018 instructions # 1.68 insn per cycle - 1.073902025 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) +EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.084772 sec + 2,074,111,548 cycles # 1.906 GHz + 3,500,542,355 instructions # 1.69 insn per cycle + 1.089027435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272748295826550E-004 -Relative difference = 2.5714542480216212e-08 +Avg ME (F77/C++) = 6.6272750363879224E-004 +Relative difference = 5.490631193034436e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index b0f7a52d59..f668536073 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:50:25 +DATE: 2024-03-01_03:08:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.540170e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.586495e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593399e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.484380 sec - 2,126,942,389 cycles # 3.017 GHz - 3,183,788,929 instructions # 1.50 insn per cycle - 0.765419001 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.485011 sec + 2,123,544,393 cycles # 3.007 GHz + 3,219,525,664 instructions # 1.52 insn per cycle + 0.766064420 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.694061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.754237e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.756851e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.637487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.856995 sec - 6,389,024,738 cycles # 3.056 GHz - 13,438,339,449 instructions # 2.10 insn per cycle - 2.150054585 seconds time elapsed +TOTAL : 1.858325 sec + 6,401,876,626 cycles # 3.056 GHz + 13,834,352,039 instructions # 2.16 insn per cycle + 2.151127842 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626454e-04 Avg ME (F77/CUDA) = 6.6262660579844562E-004 Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.892751e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.893597e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.893597e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 27.839605 sec - 85,632,537,332 cycles # 3.076 GHz - 134,110,079,741 instructions # 1.57 insn per cycle - 27.844551025 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.469746 sec + 87,566,965,728 cycles # 3.076 GHz + 135,909,521,186 instructions # 1.55 insn per cycle + 28.473960910 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627536e-04 -Avg ME (F77/C++) = 6.6275357377482830E-004 -Relative difference = 3.95700176737784e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275352674967369E-004 +Relative difference = 4.0361421941458736e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.422115e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.436822e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.436822e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.217876 sec - 6,734,489,864 cycles # 3.031 GHz - 19,223,110,429 instructions # 2.85 insn per cycle - 2.222784192 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.304055 sec + 6,854,008,563 cycles # 2.972 GHz + 19,438,508,034 instructions # 2.84 insn per cycle + 2.308246423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859765498573E-004 -Relative difference = 3.538316437387639e-09 +Avg ME (F77/C++) = 6.6274862764021530E-004 +Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.435689e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.441032e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.441032e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.151356 sec - 3,077,944,189 cycles # 2.664 GHz - 6,686,500,879 instructions # 2.17 insn per cycle - 1.156273729 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) +TOTAL : 1.070827 sec + 3,111,432,280 cycles # 2.896 GHz + 6,718,585,544 instructions # 2.16 insn per cycle + 1.075017514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.696521e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.704211e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.704211e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.975502 sec - 2,608,790,145 cycles # 2.663 GHz - 5,935,625,436 instructions # 2.28 insn per cycle - 0.980501381 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) +TOTAL : 0.900474 sec + 2,630,752,588 cycles # 2.910 GHz + 5,969,340,561 instructions # 2.27 insn per cycle + 0.904647261 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.371523e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.376383e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.376383e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.204978 sec - 2,046,707,084 cycles # 1.693 GHz - 3,423,438,147 instructions # 1.67 insn per cycle - 1.210209313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) +EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.083027 sec + 2,083,719,160 cycles # 1.918 GHz + 3,494,111,175 instructions # 1.68 insn per cycle + 1.087325959 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272749650985591E-004 -Relative difference = 5.26633351741962e-09 +Avg ME (F77/C++) = 6.6272750384530066E-004 +Relative difference = 5.80223501432476e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 5e76674d00..8553820a52 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:14:00 +DATE: 2024-03-01_02:34:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.465947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.494792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497225e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.504525e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.524953 sec - 2,256,489,010 cycles # 2.939 GHz - 3,447,539,579 instructions # 1.53 insn per cycle - 0.836200977 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.522907 sec + 2,248,416,129 cycles # 2.981 GHz + 3,483,881,112 instructions # 1.55 insn per cycle + 0.829467781 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.125742e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.159824e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.161295e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034549 sec - 10,082,494,977 cycles # 3.063 GHz - 22,355,930,391 instructions # 2.22 insn per cycle - 3.348702480 seconds time elapsed +TOTAL : 3.035491 sec + 10,039,386,860 cycles # 3.052 GHz + 22,522,898,713 instructions # 2.24 insn per cycle + 3.349083086 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.884772e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.885701e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.885701e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.711657 sec - 26,806,493,916 cycles # 3.076 GHz - 82,459,435,426 instructions # 3.08 insn per cycle - 8.718797714 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.409354 sec + 25,927,870,734 cycles # 3.082 GHz + 79,436,480,305 instructions # 3.06 insn per cycle + 8.416137774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779319e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.782699e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.782699e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.349718 sec - 12,635,010,271 cycles # 2.902 GHz - 38,537,183,996 instructions # 3.05 insn per cycle - 4.365144686 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.395641 sec + 12,641,926,900 cycles # 2.873 GHz + 38,549,360,435 instructions # 3.05 insn per cycle + 4.411574958 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.648503e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.666677e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.666677e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.906583 sec - 5,531,649,242 cycles # 2.895 GHz - 13,584,392,935 instructions # 2.46 insn per cycle - 1.919917908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) +TOTAL : 1.889905 sec + 5,503,418,397 cycles # 2.905 GHz + 13,481,227,468 instructions # 2.45 insn per cycle + 1.901949052 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.700993e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.723114e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.723114e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.700317 sec - 4,941,121,910 cycles # 2.899 GHz - 12,109,565,477 instructions # 2.45 insn per cycle - 1.711862121 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) +TOTAL : 1.679659 sec + 4,858,057,374 cycles # 2.885 GHz + 12,135,455,571 instructions # 2.50 insn per cycle + 1.694768152 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.716895e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731839e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731839e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.134730 sec - 4,095,106,977 cycles # 1.915 GHz - 6,282,362,404 instructions # 1.53 insn per cycle - 2.148505489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) +TOTAL : 2.297248 sec + 4,143,595,621 cycles # 1.801 GHz + 6,336,694,490 instructions # 1.53 insn per cycle + 2.312628428 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 562f5e1d4c..44d560fb63 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-02-05_21:14:37 +DATE: 2024-03-01_02:35:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.477704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.507523e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.510116e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505143e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527379 sec - 2,175,766,055 cycles # 2.841 GHz - 3,418,769,839 instructions # 1.57 insn per cycle - 0.834947605 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.522485 sec + 2,266,664,443 cycles # 3.011 GHz + 3,552,942,464 instructions # 1.57 insn per cycle + 0.824080628 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.125116e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.160233e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.022032 sec - 10,121,111,635 cycles # 3.087 GHz - 22,821,428,883 instructions # 2.25 insn per cycle - 3.334908174 seconds time elapsed +TOTAL : 3.023944 sec + 10,029,910,184 cycles # 3.059 GHz + 21,497,951,661 instructions # 2.14 insn per cycle + 3.338904131 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.883492e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.884333e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.884333e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.718497 sec - 26,807,367,386 cycles # 3.074 GHz - 82,358,991,278 instructions # 3.07 insn per cycle - 8.725625839 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.531114 sec + 25,939,606,781 cycles # 3.040 GHz + 79,447,311,630 instructions # 3.06 insn per cycle + 8.537643841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.755460e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.758746e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.758746e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.376440 sec - 12,657,723,358 cycles # 2.890 GHz - 38,556,519,238 instructions # 3.05 insn per cycle - 4.389904658 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.372440 sec + 12,693,692,693 cycles # 2.901 GHz + 38,521,475,204 instructions # 3.03 insn per cycle + 4.385193423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.672799e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.691336e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.691336e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.901504 sec - 5,498,608,733 cycles # 2.886 GHz - 13,598,345,039 instructions # 2.47 insn per cycle - 1.913319846 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +TOTAL : 1.908191 sec + 5,531,901,200 cycles # 2.893 GHz + 13,605,961,475 instructions # 2.46 insn per cycle + 1.920337987 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.786330e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.809305e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.809305e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.685546 sec - 4,831,483,004 cycles # 2.859 GHz - 12,121,611,558 instructions # 2.51 insn per cycle - 1.697656178 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) +TOTAL : 1.699452 sec + 4,910,284,170 cycles # 2.883 GHz + 12,271,024,564 instructions # 2.50 insn per cycle + 1.712563313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.724730e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.739490e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.739490e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.136169 sec - 4,091,747,321 cycles # 1.914 GHz - 6,289,952,093 instructions # 1.54 insn per cycle - 2.148137472 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) +TOTAL : 2.177959 sec + 4,164,411,217 cycles # 1.910 GHz + 6,442,301,345 instructions # 1.55 insn per cycle + 2.190574077 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 7d3ee494d3..93119c7539 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:17:01 +DATE: 2024-03-01_02:37:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.060831e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061257e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061386e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065940e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.455073 sec - 8,445,387,754 cycles # 3.085 GHz - 17,531,542,705 instructions # 2.08 insn per cycle - 2.853698533 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +TOTAL : 2.448496 sec + 8,082,390,398 cycles # 2.946 GHz + 16,852,562,382 instructions # 2.09 insn per cycle + 2.848455369 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.224911e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.227118e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.227325e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.995006 sec - 13,041,364,906 cycles # 3.027 GHz - 28,190,198,598 instructions # 2.16 insn per cycle - 4.376446903 seconds time elapsed +TOTAL : 4.002127 sec + 13,348,526,839 cycles # 3.088 GHz + 31,140,905,358 instructions # 2.33 insn per cycle + 4.382097820 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 Avg ME (F77/CUDA) = 9.8722595284406640E-003 Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.492023e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.492274e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.492274e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.223749 sec - 18,981,815,870 cycles # 3.050 GHz - 55,179,677,185 instructions # 2.91 insn per cycle - 6.230649212 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.566168 sec + 18,831,689,747 cycles # 2.868 GHz + 53,916,332,004 instructions # 2.86 insn per cycle + 6.572689464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.671790e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.671881e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.671881e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.166948 sec - 9,790,094,499 cycles # 3.091 GHz - 27,056,149,583 instructions # 2.76 insn per cycle - 3.181218090 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.182674 sec + 9,806,871,766 cycles # 3.081 GHz + 27,093,022,297 instructions # 2.76 insn per cycle + 3.192772007 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.621678e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622134e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622134e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.466253 sec - 4,240,132,958 cycles # 2.890 GHz - 9,565,614,864 instructions # 2.26 insn per cycle - 1.477574402 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +TOTAL : 1.462430 sec + 4,231,767,010 cycles # 2.892 GHz + 9,562,001,834 instructions # 2.26 insn per cycle + 1.472832936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.190305e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.190882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.190882e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.269480 sec - 3,685,978,412 cycles # 2.903 GHz - 8,451,253,639 instructions # 2.29 insn per cycle - 1.282888481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +TOTAL : 1.282131 sec + 3,734,243,960 cycles # 2.905 GHz + 8,486,594,514 instructions # 2.27 insn per cycle + 1.294140643 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.621525e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622079e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622079e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.466027 sec - 2,777,501,738 cycles # 1.893 GHz - 4,249,530,672 instructions # 1.53 insn per cycle - 1.478225043 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +TOTAL : 1.432645 sec + 2,701,519,987 cycles # 1.882 GHz + 4,274,080,381 instructions # 1.58 insn per cycle + 1.444722496 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 1d5f961b11..7163808f45 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:59:26 +DATE: 2024-03-01_03:17:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.064929e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065860e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065860e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.360501 sec - 8,279,629,193 cycles # 3.090 GHz - 17,543,064,215 instructions # 2.12 insn per cycle - 2.736536703 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +TOTAL : 2.373786 sec + 8,212,794,649 cycles # 3.050 GHz + 17,373,508,782 instructions # 2.12 insn per cycle + 2.749788140 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,166 +72,166 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.231256e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.262658e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.262658e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.980652 sec - 13,263,852,327 cycles # 3.083 GHz - 29,000,010,094 instructions # 2.19 insn per cycle - 4.358270161 seconds time elapsed +TOTAL : 3.992060 sec + 13,207,906,873 cycles # 3.062 GHz + 30,525,969,027 instructions # 2.31 insn per cycle + 4.371813741 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 Avg ME (F77/CUDA) = 9.8722595284406640E-003 Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.470778e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.471016e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.471016e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.238189 sec - 18,981,625,757 cycles # 3.042 GHz - 55,182,170,559 instructions # 2.91 insn per cycle - 6.243020322 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.484661 sec + 18,737,465,302 cycles # 2.888 GHz + 53,915,906,594 instructions # 2.88 insn per cycle + 6.488680620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.629833e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.629925e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.629925e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.243616 sec - 9,801,779,820 cycles # 3.019 GHz - 27,057,747,913 instructions # 2.76 insn per cycle - 3.248379444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.177972 sec + 9,794,551,146 cycles # 3.079 GHz + 27,093,049,280 instructions # 2.77 insn per cycle + 3.182112356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.634061e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.634495e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.634495e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.458634 sec - 4,233,960,705 cycles # 2.895 GHz - 9,565,082,926 instructions # 2.26 insn per cycle - 1.463398878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +TOTAL : 1.495047 sec + 4,300,282,840 cycles # 2.870 GHz + 9,561,701,370 instructions # 2.22 insn per cycle + 1.499121189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.157153e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157728e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.157728e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.276005 sec - 3,691,081,174 cycles # 2.884 GHz - 8,450,630,071 instructions # 2.29 insn per cycle - 1.280852479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +TOTAL : 1.287264 sec + 3,730,461,014 cycles # 2.891 GHz + 8,485,603,542 instructions # 2.27 insn per cycle + 1.291227222 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758568e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.759161e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.759161e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.411378 sec - 2,685,131,174 cycles # 1.898 GHz - 4,248,751,291 instructions # 1.58 insn per cycle - 1.415941060 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +TOTAL : 1.415968 sec + 2,690,639,160 cycles # 1.896 GHz + 4,273,336,878 instructions # 1.59 insn per cycle + 1.420067464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 3eccb964a9..fcaae9673e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:18:04 +DATE: 2024-03-01_02:38:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.070259e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070667e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070770e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067339e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.451051 sec - 8,371,838,735 cycles # 3.054 GHz - 17,036,301,719 instructions # 2.03 insn per cycle - 2.850514565 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +TOTAL : 2.446944 sec + 8,408,759,874 cycles # 3.068 GHz + 18,673,492,162 instructions # 2.22 insn per cycle + 2.843675081 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.279514e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.281799e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.282039e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.976549 sec - 13,229,619,550 cycles # 3.075 GHz - 30,205,700,049 instructions # 2.28 insn per cycle - 4.358227522 seconds time elapsed +TOTAL : 3.986190 sec + 13,309,313,958 cycles # 3.084 GHz + 29,253,936,467 instructions # 2.20 insn per cycle + 4.370982628 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 Avg ME (F77/CUDA) = 9.8722595284406640E-003 Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.497915e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.498146e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.498146e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.221965 sec - 19,147,431,688 cycles # 3.078 GHz - 55,160,387,913 instructions # 2.88 insn per cycle - 6.228667213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.219195 sec + 18,809,079,145 cycles # 3.025 GHz + 53,925,834,666 instructions # 2.87 insn per cycle + 6.232860023 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664813e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.664906e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.664906e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.176500 sec - 9,809,166,525 cycles # 3.085 GHz - 27,064,727,613 instructions # 2.76 insn per cycle - 3.187515497 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.189478 sec + 9,805,870,159 cycles # 3.076 GHz + 27,091,831,447 instructions # 2.76 insn per cycle + 3.203897537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.612667e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613093e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613093e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.470168 sec - 4,255,189,971 cycles # 2.893 GHz - 9,569,385,070 instructions # 2.25 insn per cycle - 1.482181474 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) +TOTAL : 1.464714 sec + 4,224,699,489 cycles # 2.882 GHz + 9,562,401,622 instructions # 2.26 insn per cycle + 1.476328883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.143983e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.144622e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.144622e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.282751 sec - 3,732,361,171 cycles # 2.906 GHz - 8,454,728,771 instructions # 2.27 insn per cycle - 1.294623526 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) +TOTAL : 1.294499 sec + 3,723,740,700 cycles # 2.874 GHz + 8,486,051,495 instructions # 2.28 insn per cycle + 1.308410916 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739856e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740406e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740406e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.420004 sec - 2,682,413,534 cycles # 1.887 GHz - 4,250,779,821 instructions # 1.58 insn per cycle - 1.433022366 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) +TOTAL : 1.421818 sec + 2,699,411,216 cycles # 1.899 GHz + 4,277,531,970 instructions # 1.58 insn per cycle + 1.435104148 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index fbcfae640e..e89ab34326 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:19:08 +DATE: 2024-03-01_02:39:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.757763e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758621e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758964e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769342e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.692068 sec - 5,923,762,775 cycles # 3.061 GHz - 11,779,158,242 instructions # 1.99 insn per cycle - 2.045271179 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +TOTAL : 1.706494 sec + 5,724,877,835 cycles # 2.946 GHz + 11,350,286,337 instructions # 1.98 insn per cycle + 2.064496697 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.306138e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.306904e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.307028e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.920494 sec - 6,750,155,929 cycles # 3.075 GHz - 14,446,579,365 instructions # 2.14 insn per cycle - 2.254837097 seconds time elapsed +TOTAL : 1.926202 sec + 6,794,636,243 cycles # 3.076 GHz + 13,931,883,029 instructions # 2.05 insn per cycle + 2.265774235 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 Avg ME (F77/CUDA) = 9.8712405367667715E-003 Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.308137e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.308421e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.308421e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.679908 sec - 17,563,953,137 cycles # 3.092 GHz - 51,786,319,770 instructions # 2.95 insn per cycle - 5.686611091 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.899633 sec + 18,012,008,843 cycles # 3.055 GHz + 53,588,806,253 instructions # 2.98 insn per cycle + 5.906269981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087330436E-003 -Relative difference = 2.119555946686223e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.625271e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.625713e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.625713e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.465861 sec - 4,536,409,404 cycles # 3.093 GHz - 13,759,557,462 instructions # 3.03 insn per cycle - 1.477471378 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.492504 sec + 4,596,969,768 cycles # 3.077 GHz + 13,763,413,131 instructions # 2.99 insn per cycle + 1.508036951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.224940e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.226824e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.226824e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.739518 sec - 2,138,196,753 cycles # 2.882 GHz - 4,827,470,131 instructions # 2.26 insn per cycle - 0.750354876 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.749250 sec + 2,146,538,234 cycles # 2.864 GHz + 4,817,770,938 instructions # 2.24 insn per cycle + 0.763621351 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.523869e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.525949e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.525949e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.709067 sec - 1,880,974,593 cycles # 2.645 GHz - 4,259,949,962 instructions # 2.26 insn per cycle - 0.721507274 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.652928 sec + 1,865,233,671 cycles # 2.849 GHz + 4,274,819,205 instructions # 2.29 insn per cycle + 0.666710238 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.449771e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.452110e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.452110e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.718076 sec - 1,359,361,418 cycles # 1.894 GHz - 2,148,710,095 instructions # 1.58 insn per cycle - 0.730309288 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.715424 sec + 1,360,172,621 cycles # 1.900 GHz + 2,159,744,323 instructions # 1.59 insn per cycle + 0.729957103 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index bc5d9230e1..684ca24c1f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_22:00:29 +DATE: 2024-03-01_03:18:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.808944e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.810771e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.810771e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.596030 sec - 5,707,463,906 cycles # 3.053 GHz - 12,247,898,086 instructions # 2.15 insn per cycle - 1.926904704 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +TOTAL : 1.598425 sec + 5,724,594,753 cycles # 3.063 GHz + 12,186,790,592 instructions # 2.13 insn per cycle + 1.928350107 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,169 +72,169 @@ WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.304775e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317135e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317135e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.879824 sec - 6,584,846,738 cycles # 3.060 GHz - 14,435,491,289 instructions # 2.19 insn per cycle - 2.211594164 seconds time elapsed +TOTAL : 1.887489 sec + 6,620,617,732 cycles # 3.045 GHz + 14,303,245,528 instructions # 2.16 insn per cycle + 2.231962749 
seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 Avg ME (F77/CUDA) = 9.8712405367667715E-003 Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.283170e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.283458e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.283458e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.693374 sec - 17,594,854,342 cycles # 3.089 GHz - 51,786,439,660 instructions # 2.94 insn per cycle - 5.698505906 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.812831 sec + 17,931,583,834 cycles # 3.083 GHz + 53,588,775,363 instructions # 2.99 insn per cycle + 5.816760256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087330436E-003 -Relative difference = 2.119555946686223e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.591153e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.591586e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.591586e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.476060 sec - 4,558,398,705 cycles # 3.082 GHz - 13,759,164,758 instructions # 3.02 insn per cycle - 1.481085306 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.483014 sec + 4,585,157,051 cycles # 3.085 GHz + 13,762,636,955 instructions # 3.00 insn per cycle + 1.487033664 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.012684e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.014407e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.014407e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.758517 sec - 2,146,491,468 cycles # 2.816 GHz - 4,826,881,543 instructions # 2.25 insn per cycle - 0.763270552 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.734407 sec + 2,124,324,714 cycles # 2.880 GHz + 4,817,114,861 instructions # 2.27 insn per cycle + 0.738469635 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.114888e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.117040e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.117040e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.655572 sec - 1,897,429,617 cycles # 2.879 GHz - 4,259,285,185 instructions # 2.24 insn per cycle - 0.660139204 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.686036 sec + 1,868,608,359 cycles # 2.710 GHz + 4,274,464,507 instructions # 2.29 insn per cycle + 0.690085324 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.527218e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.529426e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.529426e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.707913 sec - 1,351,788,067 cycles # 1.900 GHz - 2,148,014,763 instructions # 1.59 insn per cycle - 0.712542671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.701778 sec + 1,356,865,477 cycles # 1.924 GHz + 2,159,196,207 instructions # 1.59 insn per cycle + 0.705773287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index b68e70f1e6..2af18ad9d5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:19:55 +DATE: 2024-03-01_02:40:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.769637e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.770485e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.770746e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.766757e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.691744 sec - 5,877,714,864 cycles # 3.043 GHz - 12,600,922,025 instructions # 2.14 insn per cycle - 2.042117654 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +TOTAL : 1.693781 sec + 5,858,518,501 cycles # 3.029 GHz + 12,487,165,720 instructions # 2.13 insn per cycle + 2.044833380 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.329038e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.329805e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329898e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.930264 sec - 6,710,757,777 cycles # 3.034 GHz - 14,507,110,241 instructions # 2.16 insn per cycle - 2.271397744 seconds time elapsed +TOTAL : 1.933893 sec + 6,737,061,424 cycles # 3.047 GHz + 14,801,104,127 instructions # 2.20 insn per cycle + 2.267780802 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.849636e-03 Avg ME (F77/CUDA) = 9.8712405367667715E-003 Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.294726e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.295036e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.295036e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.691679 sec - 17,566,579,323 cycles # 3.086 GHz - 51,758,460,980 instructions # 2.95 insn per cycle - 5.698578649 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.925615 sec + 17,989,215,363 cycles # 3.036 GHz + 53,579,777,630 instructions # 2.98 insn per cycle + 5.931642569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087313262E-003 -Relative difference = 2.1195385077844924e-08 +Avg ME (F77/C++) = 9.8479612087582491E-003 +Relative difference = 2.1198118933954545e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.616176e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.616676e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.616676e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.469152 sec - 4,545,584,305 cycles # 3.093 GHz - 13,757,988,516 instructions # 3.03 insn per cycle - 1.485351175 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.490731 sec + 4,558,556,123 cycles # 3.055 GHz + 13,757,084,226 instructions # 3.02 insn per cycle + 1.501811120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +Avg ME (F77/C++) = 9.8479546896225560E-003 +Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.204235e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.206094e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.206094e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.739543 sec - 2,146,737,695 cycles # 2.895 GHz - 4,826,497,019 instructions # 2.25 insn per cycle - 0.752280009 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.743943 sec + 2,139,817,263 cycles # 2.875 GHz + 4,819,936,629 instructions # 2.25 insn per cycle + 0.755587883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.297842e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.300329e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.300329e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.643256 sec - 1,855,486,706 cycles # 2.874 GHz - 4,258,716,994 instructions # 2.30 insn per cycle - 0.655646533 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.647666 sec + 1,869,906,105 cycles # 2.875 GHz + 4,276,791,956 instructions # 2.29 insn per cycle + 0.664053491 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.510694e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.512912e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.512912e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.712983 sec - 1,353,886,497 cycles # 1.900 GHz - 2,147,755,049 instructions # 1.59 insn per cycle - 0.726727549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) +EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.718650 sec + 1,366,457,842 cycles # 1.901 GHz + 2,166,062,692 instructions # 1.59 insn per cycle + 0.731356674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982955140E-003 +Relative difference = 2.0044060904369713e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 16d218cf03..c639834643 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:20:42 +DATE: 2024-03-01_02:41:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.694986e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695598e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695826e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.691928e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.201129 sec - 7,429,679,479 cycles # 2.975 GHz - 16,463,728,646 instructions # 2.22 insn per cycle - 2.609456882 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +TOTAL : 2.198692 sec + 7,604,134,018 cycles # 3.054 GHz + 16,321,512,266 instructions # 2.15 insn per cycle + 2.594812497 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.108333e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108647e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108689e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.408187 sec - 11,495,638,263 cycles # 3.080 GHz - 26,728,549,756 instructions # 2.33 insn per cycle - 3.788711649 seconds time elapsed +TOTAL : 3.397194 sec + 11,475,121,938 cycles # 3.084 GHz + 26,000,925,285 instructions # 2.27 insn per cycle + 3.777191130 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 Avg ME (F77/CUDA) = 9.8722599015656498E-003 Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.471059e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.471318e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.471318e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.240779 sec - 19,232,906,655 cycles # 3.082 GHz - 55,389,629,933 instructions # 2.88 insn per cycle - 6.248170189 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.578920 sec + 19,096,747,933 cycles # 2.903 GHz + 54,154,360,803 instructions # 2.84 insn per cycle + 6.585797711 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.635227e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.635341e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.635341e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.239255 sec - 9,369,871,652 cycles # 2.893 GHz - 25,875,855,274 instructions # 2.76 insn per cycle - 3.251872137 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.239396 sec + 9,369,032,238 cycles # 2.892 GHz + 26,160,172,444 instructions # 2.79 insn per cycle + 3.251135271 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.809685e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.810215e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.810215e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.395288 sec - 4,039,725,857 cycles # 2.895 GHz - 9,120,228,738 instructions # 2.26 insn per cycle - 1.407748801 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) +TOTAL : 1.438333 sec + 4,079,178,507 cycles # 2.840 GHz + 9,228,646,226 instructions # 2.26 insn per cycle + 1.450605350 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.410989e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.411632e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.411632e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.205151 sec - 3,518,398,005 cycles # 2.916 GHz - 8,030,302,177 instructions # 2.28 insn per cycle - 1.218344654 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) +TOTAL : 1.218747 sec + 3,509,445,956 cycles # 2.879 GHz + 8,176,263,750 instructions # 2.33 insn per cycle + 1.230057623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.913569e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.914240e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.914240e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.360336 sec - 2,599,600,994 cycles # 1.911 GHz - 4,076,381,200 instructions # 1.57 insn per cycle - 1.381113740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) +TOTAL : 1.381042 sec + 2,620,845,167 cycles # 1.898 GHz + 4,155,618,865 instructions # 1.59 insn per cycle + 1.395419124 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index d6475b63f8..ace04f97d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-02-05_21:21:43 +DATE: 2024-03-01_02:42:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.694409e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695049e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695333e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.692361e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.172258 sec - 7,378,666,393 cycles # 2.961 GHz - 16,226,225,965 instructions # 2.20 insn per cycle - 2.560679655 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +TOTAL : 2.171682 sec + 7,616,890,265 cycles # 3.058 GHz + 16,356,089,453 instructions # 2.15 insn per cycle + 2.553555988 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109841e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110156e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110184e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.405673 sec - 11,489,330,658 cycles # 3.080 GHz - 26,309,022,266 instructions # 2.29 insn per cycle - 3.786697195 seconds time elapsed +TOTAL : 3.406322 sec + 11,260,210,288 cycles # 3.017 GHz + 25,906,087,343 instructions # 2.30 insn per cycle + 3.788413520 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 Avg ME (F77/CUDA) = 9.8722599015656498E-003 Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.484297e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.484561e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.484561e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.228499 sec - 19,192,891,143 cycles # 3.080 GHz - 55,417,637,865 instructions # 2.89 insn per cycle - 6.232958003 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.644473 sec + 19,262,229,911 cycles # 2.898 GHz + 54,152,472,780 instructions # 2.81 insn per cycle + 6.648593616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.643545e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.643648e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.643648e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.218830 sec - 9,334,199,176 cycles # 2.897 GHz - 25,822,511,752 instructions # 2.77 insn per cycle - 3.230562767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.257928 sec + 9,349,757,536 cycles # 2.867 GHz + 26,077,919,393 instructions # 2.79 insn per cycle + 3.270643449 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.834505e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.834980e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.834980e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.382141 sec - 4,015,593,467 cycles # 2.897 GHz - 9,098,984,041 instructions # 2.27 insn per cycle - 1.393724809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) +TOTAL : 1.408906 sec + 4,059,558,991 cycles # 2.874 GHz + 9,213,876,384 instructions # 2.27 insn per cycle + 1.420092908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.422794e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.423431e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.423431e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.198884 sec - 3,486,499,273 cycles # 2.899 GHz - 8,010,159,141 instructions # 2.30 insn per cycle - 1.211575244 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) +TOTAL : 1.231479 sec + 3,558,951,872 cycles # 2.881 GHz + 8,168,148,330 instructions # 2.30 insn per cycle + 1.241837128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.921332e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.921975e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.921975e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.351774 sec - 2,598,841,557 cycles # 1.917 GHz - 4,065,366,264 instructions # 1.56 insn per cycle - 1.362524517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) +TOTAL : 1.381601 sec + 2,619,896,392 cycles # 1.892 GHz + 4,153,497,129 instructions # 1.59 insn per cycle + 1.390536918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 3f81b13e98..4f705cbffa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:15:14 +DATE: 2024-03-01_02:35:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.669570e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.293826e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.681058e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446507 sec - 1,970,176,956 cycles # 2.985 GHz - 2,781,026,447 instructions # 1.41 insn per cycle - 0.734838981 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.446213 sec + 1,972,017,701 cycles # 2.992 GHz + 2,778,256,208 instructions # 1.41 insn per cycle + 0.734930275 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.263975e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.148170e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.555265e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.529208 sec - 2,301,609,838 cycles # 3.000 GHz - 3,254,587,662 instructions # 1.41 insn per cycle - 0.824893939 seconds time elapsed +TOTAL : 0.528224 sec + 2,304,762,750 cycles # 3.008 GHz + 3,294,040,641 instructions # 1.43 insn per cycle + 0.823439197 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424749e-01 Avg ME (F77/CUDA) = 0.14247482467490466 Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.056266e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.077334e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.077334e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.573129 sec - 4,877,263,280 cycles # 3.093 GHz - 13,800,372,792 instructions # 2.83 insn per cycle - 1.580008121 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.522856 sec + 4,703,604,569 cycles # 3.081 GHz + 13,462,460,024 instructions # 2.86 insn per cycle + 1.529442917 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047638e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.125957e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.125957e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.821495 sec - 2,560,095,543 cycles # 3.099 GHz - 7,400,936,297 instructions # 2.89 insn per cycle - 0.836848289 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.861454 sec + 2,622,516,081 cycles # 3.029 GHz + 7,553,226,055 instructions # 2.88 insn per cycle + 0.875162721 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.404910e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.625111e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.625111e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.502830 sec - 1,476,855,789 cycles # 2.911 GHz - 3,136,939,664 instructions # 2.12 insn per cycle - 0.513981856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +TOTAL : 0.506903 sec + 1,479,878,074 cycles # 2.896 GHz + 3,120,545,502 instructions # 2.11 insn per cycle + 0.521612120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.875594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.165680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165680e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444549 sec - 1,308,091,426 cycles # 2.913 GHz - 2,923,486,765 instructions # 2.23 insn per cycle - 0.459054653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +TOTAL : 0.456990 sec + 1,342,026,946 cycles # 2.909 GHz + 2,982,806,139 instructions # 2.22 insn per cycle + 0.473253864 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.700738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.840729e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.840729e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.630385 sec - 1,264,655,111 cycles # 1.992 GHz - 1,899,913,388 instructions # 1.50 insn per cycle - 0.642819266 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +TOTAL : 0.665523 sec + 1,326,336,546 cycles # 1.981 GHz + 1,954,248,677 instructions # 1.47 insn per cycle + 0.676015017 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 55c8eeafda..7838899130 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:57:45 +DATE: 2024-03-01_03:15:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.677163e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.149342e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.149342e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.469301 sec - 2,062,711,084 cycles # 3.004 GHz - 3,076,481,136 instructions # 1.49 insn per cycle - 0.744248998 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.471075 sec + 2,051,009,542 cycles # 3.009 GHz + 3,055,349,974 instructions # 1.49 insn per cycle + 0.738770181 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,166 +72,166 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.258171e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.292526e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292526e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.749639 sec - 3,027,636,992 cycles # 3.008 GHz - 4,599,039,489 instructions # 1.52 insn per cycle - 1.064296399 seconds time elapsed +TOTAL : 0.748132 sec + 3,046,262,026 cycles # 3.023 GHz + 4,636,082,832 instructions # 1.52 insn per cycle + 1.065675268 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424749e-01 Avg ME (F77/CUDA) = 0.14247482467490466 Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072651e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072651e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.584692 sec - 4,904,139,806 cycles # 3.087 GHz - 13,805,381,065 instructions # 2.82 insn per cycle - 1.589628039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.529900 sec + 4,728,814,715 cycles # 3.083 GHz + 13,467,526,764 instructions # 2.85 insn per cycle + 1.534252544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.020153e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.098786e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.098786e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.839714 sec - 2,595,460,882 cycles # 3.078 GHz - 7,449,928,495 instructions # 2.87 insn per cycle - 0.844799137 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.869004 sec + 2,652,875,861 cycles # 3.039 GHz + 7,602,145,003 instructions # 2.87 insn per cycle + 0.873736497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.334523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.553039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.553039e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.521012 sec - 1,512,179,612 cycles # 2.880 GHz - 3,186,720,547 instructions # 2.11 insn per cycle - 0.525991705 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +TOTAL : 0.550316 sec + 1,514,222,662 cycles # 2.732 GHz + 3,170,467,422 instructions # 2.09 insn per cycle + 0.554802806 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.523854e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.791701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.791701e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.495383 sec - 1,347,648,083 cycles # 2.721 GHz - 2,975,110,368 instructions # 2.21 insn per cycle - 0.500624015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +TOTAL : 0.478096 sec + 1,374,122,120 cycles # 2.850 GHz + 3,032,631,270 instructions # 2.21 insn per cycle + 0.482825268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.655028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.787748e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.787748e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.647387 sec - 1,294,117,885 cycles # 1.986 GHz - 1,936,924,916 instructions # 1.50 insn per cycle - 0.652437474 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +TOTAL : 0.675099 sec + 1,354,490,621 cycles # 1.996 GHz + 1,991,409,834 instructions # 1.47 insn per cycle + 0.679620955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index a69eb870b8..1de3a7df55 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:15:32 +DATE: 2024-03-01_02:36:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.651751e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.189507e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.538308e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445375 sec - 2,003,165,320 cycles # 3.004 GHz - 2,827,025,873 instructions # 1.41 insn per cycle - 0.738979058 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.443315 sec + 2,012,981,464 cycles # 3.013 GHz + 2,802,025,362 instructions # 1.39 insn per cycle + 0.744859677 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.236829e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.009410e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.414016e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.528660 sec - 2,299,723,777 cycles # 3.006 GHz - 3,273,869,892 instructions # 1.42 insn per cycle - 0.825223447 seconds time elapsed +TOTAL : 0.526694 sec + 2,300,725,267 cycles # 3.007 GHz + 3,244,738,845 instructions # 1.41 insn per cycle + 0.822736768 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424749e-01 Avg ME (F77/CUDA) = 0.14247482467490466 Relative difference = 5.286902838873106e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.957243e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.016444e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.016444e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.668980 sec - 4,884,718,098 cycles # 2.919 GHz - 13,808,181,391 instructions # 2.83 insn per cycle - 1.676281445 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.520645 sec + 4,710,102,553 cycles # 3.090 GHz + 13,456,334,828 instructions # 2.86 insn per cycle + 1.527404362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.036799e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.115591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.115591e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.825393 sec - 2,561,451,557 cycles # 3.087 GHz - 7,406,342,161 instructions # 2.89 insn per cycle - 0.839309204 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.841713 sec + 2,618,818,041 cycles # 3.096 GHz + 7,552,217,415 instructions # 2.88 insn per cycle + 0.854217946 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.385477e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.609605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.609605e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.505702 sec - 1,479,487,536 cycles # 2.900 GHz - 3,137,175,164 instructions # 2.12 insn per cycle - 0.519573265 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) +TOTAL : 0.506766 sec + 1,482,977,233 cycles # 2.909 GHz + 3,119,381,568 instructions # 2.10 insn per cycle + 0.519705447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.870393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157477e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.157477e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444867 sec - 1,304,235,883 cycles # 2.904 GHz - 2,924,972,743 instructions # 2.24 insn per cycle - 0.460522710 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) +TOTAL : 0.457488 sec + 1,337,095,985 cycles # 2.896 GHz + 2,979,946,273 instructions # 2.23 insn per cycle + 0.473330982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.714778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.852940e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.852940e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.626914 sec - 1,263,639,141 cycles # 2.001 GHz - 1,899,641,042 instructions # 1.50 insn per cycle - 0.637674529 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) +TOTAL : 0.666550 sec + 1,326,556,264 cycles # 1.978 GHz + 1,952,513,162 instructions # 1.47 insn per cycle + 0.681133765 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index a421dad089..4d40239a82 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:15:50 +DATE: 2024-03-01_02:36:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.317512e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217111e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.357151e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.441355 sec - 1,952,652,349 cycles # 2.978 GHz - 2,757,723,921 instructions # 1.41 insn per cycle - 0.728980619 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.439896 sec + 1,919,384,660 cycles # 2.928 GHz + 2,652,462,812 instructions # 1.38 insn per cycle + 0.728915663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.287167e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812013e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.961599e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476575 sec - 2,106,169,764 cycles # 3.003 GHz - 2,971,298,740 instructions # 1.41 insn per cycle - 0.760618216 seconds time elapsed +TOTAL : 0.476459 sec + 2,111,535,021 cycles # 3.010 GHz + 2,984,192,787 instructions # 1.41 insn per cycle + 0.759063881 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424226e-01 Avg ME (F77/CUDA) = 0.14247488790821983 Relative difference = 0.00036713209996037764 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.187334e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.215295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215295e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.400396 sec - 4,340,193,687 cycles # 3.091 GHz - 12,596,376,231 instructions # 2.90 insn per cycle - 1.407492521 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.434431 sec + 4,452,862,887 cycles # 3.097 GHz + 13,047,773,125 instructions # 2.93 insn per cycle + 1.440725517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860569653919 -Relative difference = 3.998452420257791e-08 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.308372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534483e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534483e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.515722 sec - 1,591,767,628 cycles # 3.061 GHz - 4,246,687,782 instructions # 2.67 insn per cycle - 0.530129556 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.547840 sec + 1,698,684,785 cycles # 3.077 GHz + 4,513,142,797 instructions # 2.66 insn per cycle + 0.560862800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860808920836 -Relative difference = 5.677888572434963e-08 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.065695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.836938e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.836938e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.290244 sec - 849,117,535 cycles # 2.882 GHz - 1,915,632,467 instructions # 2.26 insn per cycle - 0.311212153 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +TOTAL : 0.289099 sec + 853,788,001 cycles # 2.912 GHz + 1,897,231,072 instructions # 2.22 insn per cycle + 0.300313484 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.473801e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.375742e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.375742e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.273964 sec - 780,140,985 cycles # 2.799 GHz - 1,797,931,367 instructions # 2.30 insn per cycle - 0.284454132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +TOTAL : 0.271830 sec + 801,479,133 cycles # 2.904 GHz + 1,820,357,988 instructions # 2.27 insn per cycle + 0.285846070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.679551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.143053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.143053e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.373022 sec - 717,324,753 cycles # 1.900 GHz - 1,287,933,216 instructions # 1.80 insn per cycle - 0.387579974 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +TOTAL : 0.349567 sec + 731,841,700 cycles # 2.069 GHz + 1,305,336,291 instructions # 1.78 insn per cycle + 0.359850888 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490450137867 -Relative difference = 3.159418737238044e-08 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 6c6a59a9fb..441da29ffb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,57 +13,57 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:58:03 +DATE: 2024-03-01_03:16:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.777207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.054581e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.054581e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.453058 sec - 1,974,608,872 cycles # 2.986 GHz - 2,903,059,814 instructions # 1.47 insn per cycle - 0.721035655 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +TOTAL : 0.448633 sec + 2,014,530,108 cycles # 3.024 GHz + 2,953,646,670 instructions # 1.47 insn per cycle + 0.724573840 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -72,169 +72,169 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.250884e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.608433e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.608433e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.615768 sec - 2,539,106,336 cycles # 2.997 GHz - 3,850,180,903 instructions # 1.52 insn per cycle - 0.905916528 seconds time elapsed +TOTAL : 0.616658 sec + 2,563,348,424 cycles # 3.027 GHz + 3,871,269,369 instructions # 1.51 insn per cycle + 
0.904047137 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424226e-01 Avg ME (F77/CUDA) = 0.14247488790821983 Relative difference = 0.00036713209996037764 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.186508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.214056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214056e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.403761 sec - 4,352,321,749 cycles # 3.092 GHz - 12,600,670,105 instructions # 2.90 insn per cycle - 1.408621157 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.433803 sec + 4,469,694,345 cycles # 3.110 GHz + 13,052,094,019 instructions # 2.92 insn per cycle + 1.437926738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860569653919 -Relative difference = 3.998452420257791e-08 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.308333e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.536707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536707e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.519668 sec - 1,610,612,694 cycles # 3.076 GHz - 4,293,733,776 instructions # 2.67 insn per cycle - 0.524564296 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.554057 sec + 1,716,801,013 cycles # 3.079 GHz + 4,560,314,564 instructions # 2.66 insn per cycle + 0.558193661 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860808920836 -Relative difference = 5.677888572434963e-08 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.070636e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.830253e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.830253e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.293851 sec - 866,435,010 cycles # 2.909 GHz - 1,951,871,013 instructions # 2.25 insn per cycle - 0.298758260 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +TOTAL : 0.297621 sec + 872,015,724 cycles # 2.894 GHz + 1,933,356,220 instructions # 2.22 insn per cycle + 0.301984624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.668627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.614860e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.614860e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.269788 sec - 795,276,981 cycles # 2.905 GHz - 1,833,827,446 instructions # 2.31 insn per cycle - 0.274696767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +TOTAL : 0.276934 sec + 818,470,682 cycles # 2.917 GHz + 1,856,220,484 instructions # 2.27 insn per cycle + 0.281151541 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.025648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.546457e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.546457e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.352529 sec - 736,964,720 cycles # 2.068 GHz - 1,328,948,572 instructions # 1.80 insn per cycle - 0.357272217 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +TOTAL : 0.358667 sec + 751,185,964 cycles # 2.073 GHz + 1,346,032,296 instructions # 1.79 insn per cycle + 0.362975431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490450137867 -Relative difference = 3.159418737238044e-08 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index de231e55ec..8918bec5c8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,211 +13,211 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:16:07 +DATE: 2024-03-01_02:36:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.412670e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199054e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349763e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439949 sec - 1,957,075,445 cycles # 2.994 GHz - 2,770,692,821 instructions # 1.42 insn per cycle - 0.720018081 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.436130 sec + 1,959,442,257 cycles # 3.009 GHz + 2,743,667,126 instructions # 1.40 insn per cycle + 0.720037686 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.167952e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.776862e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920498e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.473091 sec - 2,130,521,881 cycles # 3.019 GHz - 3,019,247,264 instructions # 1.42 insn per cycle - 0.763574930 seconds time elapsed +TOTAL : 0.476114 sec + 2,116,952,174 cycles # 3.025 GHz + 3,000,364,507 instructions # 1.42 insn per cycle + 0.758577490 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424226e-01 Avg ME (F77/CUDA) = 0.14247488790821983 Relative difference = 0.00036713209996037764 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.188596e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.216506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.216506e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.398310 sec - 4,338,466,293 cycles # 3.094 GHz - 12,587,646,115 instructions # 2.90 insn per cycle - 1.405152993 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.438010 sec + 4,446,707,539 cycles # 3.084 GHz + 13,028,651,848 instructions # 2.93 insn per cycle + 1.444314220 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860569653919 -Relative difference = 3.998452420257791e-08 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.336768e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.569890e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.569890e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.511225 sec - 1,585,938,113 cycles # 3.076 GHz - 4,241,172,905 instructions # 2.67 insn per cycle - 0.526002477 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.547784 sec + 1,696,823,876 cycles # 3.074 GHz + 4,509,092,353 instructions # 2.66 insn per cycle + 0.559046282 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860808920836 -Relative difference = 5.677888572434963e-08 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.172477e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.952911e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.952911e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.285227 sec - 847,077,737 cycles # 2.926 GHz - 1,913,660,776 instructions # 2.26 insn per cycle - 0.300443120 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) +TOTAL : 0.292180 sec + 859,590,330 cycles # 2.901 GHz + 1,893,994,453 instructions # 2.20 insn per cycle + 0.304986924 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.733284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.677333e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.677333e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.263291 sec - 776,406,702 cycles # 2.902 GHz - 1,795,697,825 instructions # 2.31 insn per cycle - 0.278345913 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) +TOTAL : 0.269638 sec + 798,515,936 cycles # 2.915 GHz + 1,816,168,831 instructions # 2.27 insn per cycle + 0.281600896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.156147e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.704757e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704757e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.339548 sec - 716,982,371 cycles # 2.084 GHz - 1,286,640,400 instructions # 1.79 insn per cycle - 0.353133541 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) +TOTAL : 0.355005 sec + 734,840,966 cycles # 2.046 GHz + 1,303,017,912 instructions # 1.77 insn per cycle + 0.365594980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490450137867 -Relative difference = 3.159418737238044e-08 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index caba5422fa..9473075c44 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:16:24 +DATE: 2024-03-01_02:37:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.665805e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.279968e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.648438e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.442706 sec - 2,029,552,154 cycles # 3.029 GHz - 2,846,442,861 instructions # 1.40 insn per cycle - 0.743425796 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +TOTAL : 0.444064 sec + 2,011,501,510 cycles # 2.996 GHz + 2,813,725,950 instructions # 1.40 insn per cycle + 0.745188123 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.261949e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.123956e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.541939e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.529095 sec - 2,306,980,644 cycles # 3.012 GHz - 3,280,944,770 instructions # 1.42 insn per cycle - 0.823708444 seconds time elapsed +TOTAL : 0.531362 sec + 2,289,898,203 cycles # 2.976 GHz + 3,193,334,828 instructions # 1.39 insn per cycle + 0.827090728 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424749e-01 Avg ME (F77/CUDA) = 0.14247482577104625 Relative difference = 5.209967070245855e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.060897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081794e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081794e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.566339 sec - 4,891,432,798 cycles # 3.116 GHz - 13,823,965,188 instructions # 2.83 insn per cycle - 1.573214922 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.528426 sec + 4,733,772,591 cycles # 3.090 GHz + 13,465,129,433 instructions # 2.84 insn per cycle + 1.534888113 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033305e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.110362e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.110362e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.827032 sec - 2,591,385,710 cycles # 3.116 GHz - 7,349,073,111 instructions # 2.84 insn per cycle - 0.841790047 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.843067 sec + 2,603,799,246 cycles # 3.073 GHz + 7,385,481,301 instructions # 2.84 insn per cycle + 0.853727039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.469754e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.493958 sec - 1,464,063,754 cycles # 2.938 GHz - 3,084,407,899 instructions # 2.11 insn per cycle - 0.506428666 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) +TOTAL : 0.502006 sec + 1,465,753,503 cycles # 2.896 GHz + 3,056,435,528 instructions # 2.09 insn per cycle + 0.511483566 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.961658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.263124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.263124e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.435846 sec - 1,279,115,887 cycles # 2.905 GHz - 2,873,181,084 instructions # 2.25 insn per cycle - 0.445725915 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) +TOTAL : 0.444397 sec + 1,302,869,174 cycles # 2.905 GHz + 2,931,108,724 instructions # 2.25 insn per cycle + 0.456529729 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.608300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738304e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738304e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.652237 sec - 1,302,193,887 cycles # 1.983 GHz - 1,914,659,883 instructions # 1.47 insn per cycle - 0.667291833 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) +TOTAL : 0.681918 sec + 1,362,782,748 cycles # 1.986 GHz + 1,970,355,079 instructions # 1.45 insn per cycle + 0.693685126 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index dea31763dd..f04f8628ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d @@ -13,208 +13,208 @@ make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-02-05_21:16:43 +DATE: 2024-03-01_02:37:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.634850e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.135834e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.502178e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445383 sec - 2,000,203,957 cycles # 3.007 GHz - 2,795,360,403 instructions # 1.40 insn per cycle - 0.738846057 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +TOTAL : 0.445224 sec + 1,992,469,002 cycles # 2.992 GHz + 2,813,148,728 instructions # 1.41 insn per cycle + 0.736789901 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.227172e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.969457e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.380914e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.531264 sec - 2,302,864,776 cycles # 2.996 GHz - 3,298,090,495 instructions # 1.43 insn per cycle - 0.827884323 seconds time elapsed +TOTAL : 0.532147 sec + 2,297,521,664 cycles # 2.990 GHz + 3,210,517,070 instructions # 1.40 insn per cycle + 0.827894226 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.424749e-01 Avg ME (F77/CUDA) = 0.14247482577104625 Relative difference = 5.209967070245855e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052323e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.073479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.073479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.578545 sec - 4,898,302,542 cycles # 3.095 GHz - 13,831,146,011 instructions # 2.82 insn per cycle - 1.585728014 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.523445 sec + 4,724,741,346 cycles # 3.094 GHz + 13,451,257,746 instructions # 2.85 insn per cycle + 1.529633779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.996784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.073146e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.073146e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.842613 sec - 2,603,005,292 cycles # 3.074 GHz - 7,352,584,625 instructions # 2.82 insn per cycle - 0.855614959 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.835617 sec + 2,595,186,002 cycles # 3.089 GHz + 7,389,201,553 instructions # 2.85 insn per cycle + 0.854907608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.412508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.635285e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.635285e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501624 sec - 1,466,613,920 cycles # 2.898 GHz - 3,084,946,401 instructions # 2.10 insn per cycle - 0.513214103 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) +TOTAL : 0.503119 sec + 1,466,604,979 cycles # 2.890 GHz + 3,056,260,975 instructions # 2.08 insn per cycle + 0.515296062 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.954805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.258575e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.258575e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.435439 sec - 1,276,676,932 cycles # 2.902 GHz - 2,874,881,412 instructions # 2.25 insn per cycle - 0.450491255 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) +TOTAL : 0.457389 sec + 1,310,592,019 cycles # 2.838 GHz + 2,931,897,706 instructions # 2.24 insn per cycle + 0.469608344 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.598476e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.727689e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.727689e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.653930 sec - 1,303,406,976 cycles # 1.980 GHz - 1,915,098,748 instructions # 1.47 insn per cycle - 0.670717403 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) +TOTAL : 0.689340 sec + 1,364,202,689 cycles # 1.967 GHz + 1,970,285,028 instructions # 1.44 insn per cycle + 0.699058633 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 From f4d951c7ddfc635707c14e0fe5a0628fd4aec0ac Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 1 Mar 2024 08:19:49 +0100 Subject: [PATCH 84/96] [susy2] rerun 18 tmad tests on itscrd90, all ok STARTED AT Fri Mar 1 03:29:12 AM CET 2024 ENDED AT Fri Mar 1 07:51:32 AM CET 2024 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 210 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 214 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 202 +++++++------- .../log_ggtt_mad_d_inl0_hrd0.txt | 222 ++++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 242 ++++++++--------- .../log_ggtt_mad_m_inl0_hrd0.txt | 238 ++++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 216 +++++++-------- .../log_ggttg_mad_f_inl0_hrd0.txt | 242 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 226 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 240 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 246 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 246 ++++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 242 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 250 +++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 232 ++++++++-------- .../log_gqttq_mad_d_inl0_hrd0.txt | 226 ++++++++-------- 
.../log_gqttq_mad_f_inl0_hrd0.txt | 238 ++++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 234 ++++++++-------- 18 files changed, 2083 insertions(+), 2083 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 5792e7e600..fb2022a061 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' @@ -10,33 +10,33 @@ make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-05_22:17:58 +DATE: 2024-03-01_03:35:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6698s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6832s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1641s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1761s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 
events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,8 +109,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3626s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2752s + [COUNTERS] PROGRAM TOTAL : 0.3673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1695s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0788s for 90112 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.119401e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150011e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.97E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3238s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0442s for 90112 events => throughput is 2.04E+06 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.3298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.960201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.056109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1691s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1662s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.667896e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.846362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 2.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3086s 
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0306s for 90112 events => throughput is 2.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.811128e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.083958e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661462E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 
[9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1714s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661462E-002) differ by less than 3E-14 (7.771561172376096e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 90112 events => throughput is 2.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,14 +484,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.407049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.476907e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7209s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.82E+07 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.7068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.167668e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968899e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.735772e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
7.440392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.710260e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.012226e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.703491e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.128966e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 2b4c81420f..130936da07 100644 --- 
a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-05_22:18:15 +DATE: 2024-03-01_03:35:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6630s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6547s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1626s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 
events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s - [COUNTERS] Fortran MEs ( 1 ) : 0.0891s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382700437610044E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1727s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0067s for 8192 events => throughput is 1.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700437610044E-002) differ by less than 4E-4 (1.6027646465577305e-07) +OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382703205998396E-002) differ by less than 4E-4 (1.306308462512007e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515587669165246E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3531s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2813s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0717s for 90112 events => throughput is 1.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3578s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587669165246E-002) differ by less than 4E-4 (1.568129937012941e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002) differ by less than 4E-4 (1.2999352305698153e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.212872e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1667s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1641s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3047s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2770s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0277s for 90112 events => throughput is 3.25E+06 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.3141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.211664e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.224682e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1637s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3027s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2781s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 90112 events => throughput is 3.66E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3181s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.652051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.835303e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1648s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events 
=> throughput is 4.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0231s for 90112 events => throughput is 3.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.831859e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.986193e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382704356154977E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1670s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704356154977E-002) differ by less than 4E-4 (1.1831425661412709e-07) +OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704335459282E-002) differ by less than 4E-4 (1.1853587900123586e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515591292297929E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3074s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 90112 events => throughput is 3.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591292297929E-002) differ by less than 4E-4 (1.172226659074127e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002) differ by less than 4E-4 (1.1717945325173673e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.842306e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.784479e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5797s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5793s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.71E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.6986s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6939s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.7069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.684998e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.020092e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.026150e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
2.045722e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.017663e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.235265e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.404519e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.457162e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 41ed20187f..da7367ae5e 100644 --- 
a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -make[1]: Nothing to be done for 
'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-02-05_22:18:32 +DATE: 2024-03-01_03:36:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6567s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6485s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1705s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1622s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 
events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3615s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2738s - [COUNTERS] Fortran MEs ( 1 ) : 0.0876s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s + [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) +OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701395E-002) differ by less than 2E-4 (1.7176482458580722e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3634s + [COUNTERS] PROGRAM TOTAL : 0.3592s [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0774s for 90112 events => throughput is 1.16E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142475e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] 
ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1667s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 8192 events => throughput is 2.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3261s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2820s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 90112 events => throughput is 2.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.009384e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.109517e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1660s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 90112 events => throughput is 2.69E+06 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.3229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.614910e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.770588e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1635s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0028s for 8192 events => throughput is 2.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1830s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < 
/tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 90112 events => throughput is 2.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.670770e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.913155e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1780s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1746s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events 
=> throughput is 2.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2997s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0390s for 90112 events => throughput is 2.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3270s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,14 +484,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.342746e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.449709e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5833s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.6969s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.7064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201225e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.975595e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.723786e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
7.507428e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.698364e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.060413e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.736917e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.170942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 204bcc3c17..657075d34f 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing 
to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-05_22:18:49 +DATE: 2024-03-01_03:36:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7649s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7239s - [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8052s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s + [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,8 +84,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3910s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3503s + [COUNTERS] PROGRAM TOTAL : 0.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < 
/tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7562s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3060s - [COUNTERS] Fortran MEs ( 1 ) : 0.4502s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6297s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s + [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,27 +167,27 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4002s for 90112 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput 
is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.211296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.248534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3938s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3718s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5214s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2333s for 90112 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4997s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989106) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.878640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.806741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3752s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2853s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1457s for 90112 events => throughput is 6.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.965700e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.009550e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3743s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4335s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1284s for 90112 events => throughput is 7.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.031412e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.139328e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3667s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0176s for 8192 events => throughput is 4.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3919s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4856s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2935s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1921s for 90112 events => throughput is 4.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.481319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.550589e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7810s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7804s + [COUNTERS] PROGRAM TOTAL : 0.7828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7200s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7136s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 
events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279989121) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.064090e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.746111e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.003251e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow 
summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.076004e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.154292e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.001053e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.043495e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 5482dc0552..eb011c6697 
100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -make[1]: Nothing to be done 
for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-05_22:19:16 +DATE: 2024-03-01_03:36:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7588s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7181s - [COUNTERS] Fortran MEs ( 1 ) : 0.0407s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3530s - [COUNTERS] Fortran MEs ( 1 ) : 0.0421s for 8192 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN 
x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7598s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3088s - [COUNTERS] Fortran MEs ( 1 ) : 0.4511s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6449s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s + [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094177233089695] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4164s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3823s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094177233089695) differ by less than 4E-4 (1.6075587627728538e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094179780921394) differ by less than 4E-4 (1.0665510541407741e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105686104543288] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 2.2170s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8286s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3884s for 90112 events => throughput is 2.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6592s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105686104543288) differ by less than 4E-4 (1.9478421364738097e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ by less than 4E-4 (1.4224799227413598e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.309794e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.307910e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094173275857273] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3624s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094173275857273) differ by less than 4E-4 (2.447839242414318e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094175850060040) differ by less than 4E-4 (1.9012318908107062e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105682058834830] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4327s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2782s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1545s for 90112 events => throughput is 5.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105682058834830) differ by less than 4E-4 (2.8066997403985994e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ by less than 4E-4 (2.2324275217311396e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.643949e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.734827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3633s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3557s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3622s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2772s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0850s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3368s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.045248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049590e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094171343713690] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3614s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3544s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094171343713690) differ by less than 4E-4 (2.8581114641657024e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105681519092386] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3569s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0774s for 90112 events => throughput is 1.16E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3377s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105681519092386) differ by less than 4E-4 (2.9212808838607884e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.147039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178385820562] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0098s for 8192 events => throughput is 8.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094178385820562) differ by less than 4E-4 (1.3627873807209312e-07) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094178213275804) differ by less than 4E-4 (1.3994256109484127e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105688391077187] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2782s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1076s for 90112 events => throughput is 8.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105688391077187) differ by less than 4E-4 (1.46243715803962e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ by less than 4E-4 (1.4588574703822133e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.950723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.149487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7714s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.51E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6987s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6934s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7005s + 
[COUNTERS] Fortran Overhead ( 0 ) : 1.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722470687011935e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.301322e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.982927e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.833883e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process 
= SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.787018e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.833466e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.876783e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.365328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.354802e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 45fc68aa32..bef66309f6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-02-05_22:19:42 +DATE: 2024-03-01_03:37:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7623s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7212s - [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3907s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3500s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN 
x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=0 + [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7512s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3026s - [COUNTERS] Fortran MEs ( 1 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6496s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s + [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4230s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7095s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4061s for 90112 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006634) differ by less than 2E-4 (2.8659327133695456e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ by less than 2E-4 (2.865932691165085e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.202321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.194899e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863887] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3881s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0204s for 8192 events => throughput is 4.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863887) differ by less than 2E-4 (2.841342827686333e-08) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5147s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2248s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659327133695456e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ by less than 2E-4 (2.8659326689606246e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.870020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.915241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3767s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3634s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4274s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1450s for 90112 events => throughput is 6.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4021s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.250062e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.098829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5095s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4106s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2855s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1252s for 90112 events => throughput is 7.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.075524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.434399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208813] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3845s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0168s for 8192 events => throughput is 4.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186193208813) differ by less than 2E-4 (2.950368882537191e-08) +OK! 
xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630845] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4742s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2875s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1868s for 90112 events => throughput is 4.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4767s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105696667630845) differ by less than 2E-4 (2.945804622456194e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.593121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.524976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7749s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7743s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7352s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7287s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6837s + 
[COUNTERS] Fortran Overhead ( 0 ) : 1.6773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989099) and cpp (47.105695279068492) differ by less than 2E-4 (1.9543477947081556e-11) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.068039e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.702432e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.012365e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process 
= SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.064674e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996044e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.147754e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022142e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.000250e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d12de30a22..cd3823dd45 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:20:10 +DATE: 2024-03-01_03:37:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6745s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s - [COUNTERS] Fortran MEs ( 1 ) : 0.3266s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6326s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3084s - [COUNTERS] Fortran MEs ( 1 ) : 0.3242s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE 
MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.0816s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5148s - [COUNTERS] Fortran MEs ( 1 ) : 3.5669s for 90112 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s + [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9448s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6238s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3210s for 8192 events => throughput is 2.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717694E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3214s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8115s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5099s for 90112 events => throughput is 2.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3959s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717694E-002) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636181e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.640749e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4712s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1675s for 8192 events => throughput is 4.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717680E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.5511s - [COUNTERS] Fortran 
Overhead ( 0 ) : 1.7126s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8385s for 90112 events => throughput is 4.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4996s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717680E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.072602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.077354e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3897s - [COUNTERS] CudaCpp MEs ( 
2 ) : 0.0833s for 8192 events => throughput is 9.83E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5092s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5875s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9217s for 90112 events => throughput is 9.78E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5052s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.006722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.926199e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3817s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3873s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8085s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM 
TOTAL : 2.3870s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.125594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.143588e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5051s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4064s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0987s for 8192 events => throughput is 8.30E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] 
ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7499s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6373s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1127s for 90112 events => throughput is 8.10E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7535s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,14 +484,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.367410e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.383766e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7445s - [COUNTERS] Fortran Overhead ( 0 
) : 0.7391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9506s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9280s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632699e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.198067e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.687085e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 9.652855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.254650e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685444e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.763638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 6acca50600..b22193f403 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:20:53 +DATE: 2024-03-01_03:38:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3423s - [COUNTERS] 
Fortran MEs ( 1 ) : 0.3245s for 8192 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6628s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] Fortran MEs ( 1 ) : 0.3277s for 8192 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.1044s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5176s - [COUNTERS] Fortran MEs ( 1 ) : 3.5868s for 90112 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112722327776243] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 
[0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3065s for 8192 events => throughput is 2.67E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722327776243) differ by less than 4E-4 (2.5986973362090993e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722621426752) differ by less than 4E-4 (2.569659680817793e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238466406484034E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.1640s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8215s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3425s for 90112 events => throughput is 2.70E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3385s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238466406484034E-002) differ by less than 4E-4 (1.9594309874637617e-07) +OK! 
xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002) differ by less than 4E-4 (1.719182115555995e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.825867e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.822029e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112720218188545] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4892s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0910s for 8192 events => throughput is 9.00E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112720218188545) differ by less than 4E-4 (2.8073040938547678e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720710186394) differ by less than 4E-4 (2.758652844936371e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238450523404405E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5887s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5903s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9985s for 90112 events => throughput is 9.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5977s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238450523404405E-002) differ by less than 4E-4 (3.9638963988952725e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002) differ by less than 4E-4 (3.4258681169685445e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.157934e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.113656e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3952s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0429s for 8192 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0166s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5446s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4720s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9911s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.923701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721286411488] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3461s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 8192 events => throughput is 2.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721286411488) differ by less than 4E-4 (2.701672777827291e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238449434208005E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9524s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5375s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4149s for 90112 events => throughput is 2.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9540s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238449434208005E-002) differ by less than 4E-4 (4.101354408314606e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.197542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.185435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112723411062496] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4079s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3585s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0494s for 8192 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723411062496) differ by less than 4E-4 (2.491576483576452e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112723387847480) differ by less than 4E-4 (2.4938721023826105e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238464401552092E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0986s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5551s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5435s for 90112 events => throughput is 1.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0938s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464401552092E-002) differ by less than 4E-4 (2.2124560195013743e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002) differ by less than 4E-4 (2.211270000440635e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635897e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.640737e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112726034625695] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7353s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.77E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625695) differ by less than 4E-4 (2.2321452151086163e-06) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9356s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9261s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +560,43 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.324054e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.861547e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.639674e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.427401e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.660976e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.534637e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.507023e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.629083e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index efef9126ba..994bc4f8f2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:21:32 +DATE: 2024-03-01_03:39:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6659s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3410s - [COUNTERS] Fortran MEs ( 1 ) : 0.3248s for 8192 events 
=> throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s + [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3078s - [COUNTERS] Fortran MEs ( 1 ) : 0.3208s for 8192 events => throughput is 2.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s + [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.1015s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5259s - [COUNTERS] Fortran MEs ( 1 ) : 3.5756s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s + [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0134s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6664s - [COUNTERS] CudaCpp 
MEs ( 2 ) : 0.3470s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.4987s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8629s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6359s for 90112 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.570856e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.544366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748702805031] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6336s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1649s for 8192 events => throughput is 4.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6394s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805031) differ by less than 2E-4 (9.399612643790078e-09) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748702805033) differ by less than 2E-4 (9.399612865834683e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482683055653E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.4637s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6640s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7997s for 90112 events => throughput is 5.01E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055653E-002) differ by less than 2E-4 (9.469362849401364e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002) differ by less than 2E-4 (9.469362849401364e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.095748e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.111417e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) [COUNTERS] PROGRAM TOTAL : 0.4771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3934s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0837s for 8192 events => throughput is 9.79E+04 events/s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.4900s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5765s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9135s for 90112 events => throughput is 9.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.013857e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.015949e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3758s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3555s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5646s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7909s for 90112 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3626s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.139304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415583] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5114s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4097s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1017s for 8192 events => throughput is 8.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5403s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415583) differ by less than 2E-4 (7.284515213257237e-09) +OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748700265108) differ by less than 2E-4 (9.148451995955043e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347218E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7719s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6293s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1426s for 90112 events => throughput is 7.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347218E-002) differ by less than 2E-4 (7.592642958798024e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002) differ by less than 2E-4 (9.255082034087536e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.151380e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.222415e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7428s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7459s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9444s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 
1.9191s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +560,43 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.637764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.311283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.607274e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 
) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.634483e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243748e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.619987e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.709088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 42f6d38589..455a867420 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:22:15 +DATE: 2024-03-01_03:39:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4979s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3422s - [COUNTERS] Fortran MEs ( 1 ) : 4.1556s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s + [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4698s - [COUNTERS] Fortran Overhead ( 0 ) 
: 0.3331s - [COUNTERS] Fortran MEs ( 1 ) : 4.1367s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.6993s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0438s - [COUNTERS] Fortran MEs ( 1 ) : 45.6555s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7126s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s + [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.9166s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5742s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3424s for 8192 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7056s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 54.1949s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2939s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.9009s for 90112 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.1561s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.927146e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.934290e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7446s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5173s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2273s for 8192 events => throughput is 3.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7773s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 28.9914s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2638s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.7275s for 90112 events => throughput is 3.64E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.0103s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (0.0) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.808325e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.810712e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2692s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2960s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9732s for 8192 events => throughput is 8.42E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.7356s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0257s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7099s for 90112 events => throughput is 8.41E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.7501s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.639623e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.677492e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0280s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1744s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8536s for 8192 events => throughput is 9.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.0290s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2880s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8878s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4002s for 90112 events => throughput is 9.59E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2922s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.851640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.826048e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.4467s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3864s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0603s for 8192 events => throughput is 7.73E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451707E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.9115s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1698s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7417s for 90112 events => throughput is 7.67E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7910s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451707E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.829236e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.877772e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8732s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8402s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.9273s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5680s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3593s for 90112 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.8233s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.297744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523615e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.102829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.140866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.114484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.173422e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.119855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.426650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 89070eac21..5e945a4db8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg 
CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:26:31 +DATE: 2024-03-01_03:43:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] 
PROGRAM TOTAL : 4.4620s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] Fortran MEs ( 1 ) : 4.1254s for 8192 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3337s - [COUNTERS] Fortran MEs ( 1 ) : 4.1489s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.7647s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0487s - [COUNTERS] Fortran MEs ( 1 ) : 45.7160s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5707s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s + [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s *** 
(2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728935895570E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.1948s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2129s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.9819s for 8192 events => throughput is 2.06E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728935895570E-004) differ by less than 4E-4 (3.0081376303225937e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703729438336302E-004) differ by less than 4E-4 (3.021119383106452e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486223749466E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 49.9434s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9464s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.9970s for 90112 events => throughput is 2.05E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.1261s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486223749466E-004) differ by less than 4E-4 (3.0127256538392544e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004) differ by less than 4E-4 (3.0382263187522796e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118721e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703721162664038E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5421s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4276s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1145s for 8192 events => throughput is 7.35E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703721162664038E-004) differ by less than 4E-4 (2.8072976823168005e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722581317850E-004) differ by less than 4E-4 (2.843951981690296e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482900053113E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.5186s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1615s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3571s for 90112 events => throughput is 7.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4011s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482900053113E-004) differ by less than 4E-4 (2.8022777314173908e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004) differ by less than 4E-4 (2.856718252175483e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.661702e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.676226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.3053s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8175s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4878s for 8192 events => throughput is 1.68E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.9508s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5496s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4012s for 90112 events => throughput is 1.67E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8863s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722931e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.655694e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703719746039955E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7577s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4314s for 8192 events => throughput is 1.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703719746039955E-004) differ by less than 4E-4 (2.7706958254380964e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793482744283897E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.1796s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4657s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7139s for 90112 events => throughput is 1.91E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.2113s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793482744283897E-004) differ by less than 4E-4 (2.7924148244817815e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.965165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.961063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728656142196E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.3838s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8569s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5269s for 8192 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703728656142196E-004) differ by less than 4E-4 (3.0009095357552695e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703728658657426E-004) differ by less than 4E-4 (3.0009745224379714e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486988396928E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.3413s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5676s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.7737s for 90112 events => throughput is 1.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793486988396928E-004) differ by less than 4E-4 (3.0611411687697654e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004) differ by less than 4E-4 (3.0604373708609245e-06) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.563090e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.584923e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8348s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8135s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.197566737389579e-06) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.7662s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5323s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2339s for 90112 events => throughput is 3.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7017s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.2090047175081793e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.598573e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.958409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.495046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.667956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.492157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.729344e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.485281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.532672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518477e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a9f0e03001..4a1ef98d00 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg 
CUDACPP_BUILDDIR='.' @@ -10,33 +10,33 @@ make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:29:50 +DATE: 2024-03-01_03:47:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4716s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.3356s - [COUNTERS] Fortran MEs ( 1 ) : 4.1360s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102345E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4821s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3396s - [COUNTERS] Fortran MEs ( 1 ) : 4.1425s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s + [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 48.1896s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0584s - [COUNTERS] Fortran MEs ( 1 ) : 46.1312s for 90112 events => throughput is 1.95E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.6222s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s + [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create 
events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612659176647E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 9.0456s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6365s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4091s for 8192 events => throughput is 1.86E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7912s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612659176647E-004) differ by less than 2E-4 (3.851689633904698e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612659176674E-004) differ by less than 2E-4 (3.851690077993908e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438704534937E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 54.9541s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4100s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.5441s for 90112 events => throughput is 1.86E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.4090s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s *** 
(2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438704534937E-004) differ by less than 2E-4 (3.930950898123342e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004) differ by less than 2E-4 (3.930950231989527e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.919374e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.902053e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612692816692E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7464s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5234s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2230s for 8192 events => throughput is 3.69E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7232s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s 
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612692816692E-004) differ by less than 2E-4 (4.720860369289426e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612692816703E-004) differ by less than 2E-4 (4.720860369289426e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438707226032E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 28.6832s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2381s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4451s for 90112 events => throughput is 3.69E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.6711s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438707226032E-004) differ by less than 2E-4 (4.101344153184527e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004) differ by less than 2E-4 (4.1013439311399225e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.801229e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.806913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2352s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2775s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9577s for 8192 events => throughput is 8.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2625s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.6830s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0214s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.6615s for 90112 events => throughput is 8.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6031s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.694548e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.713044e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0177s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1622s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8555s for 8192 events => throughput is 9.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.0253s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2482s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8854s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.3628s for 90112 events => throughput is 9.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2295s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.966213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.798303e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240507E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.4852s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4051s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0801s for 8192 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4883s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612675240507E-004) differ by less than 2E-4 (4.266737629876616e-09) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631772E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.9006s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1273s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7733s for 90112 events => throughput is 7.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438703631772E-004) differ by less than 2E-4 (3.873764864437135e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.689329e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.748625e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8417s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102345E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.4279691852343603e-11) +OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642387715E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.9385s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5754s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3630s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.8559s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451704E-004) and cpp (1.5793438642387715E-004) differ by less than 2E-4 (4.051647906067046e-12) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.285322e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.527417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.103369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.097040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.433644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430504e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 91a0e957d1..6ba33cd625 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-05_22:35:39 +DATE: 2024-03-01_03:53:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 95.1303s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4967s - [COUNTERS] Fortran MEs ( 1 ) : 94.6335s for 8192 events => throughput is 8.66E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s + [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 94.7777s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.4926s - [COUNTERS] Fortran MEs ( 1 ) : 94.2850s for 8192 events => throughput is 8.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1046.4939s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3422s - [COUNTERS] Fortran MEs ( 1 ) : 1042.1517s for 90112 events => throughput is 8.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3505s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 210.0765s - [COUNTERS] Fortran Overhead ( 0 ) : 97.1032s - [COUNTERS] CudaCpp MEs ( 2 ) : 112.9733s for 8192 events => throughput is 7.25E+01 events/s + [COUNTERS] PROGRAM TOTAL : 212.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1343.2987s - [COUNTERS] Fortran Overhead ( 0 ) : 100.2236s - [COUNTERS] CudaCpp MEs ( 2 ) : 1243.0751s for 90112 events => throughput is 7.25E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1356.0370s + [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s + [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.092994e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.445665e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 106.7093s - [COUNTERS] Fortran Overhead ( 0 ) : 49.4816s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.2277s for 8192 events => throughput is 1.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.3498s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939197E-006) differ by less than 3E-14 (1.7763568394002505e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 686.0903s - [COUNTERS] Fortran Overhead ( 0 ) : 53.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 632.7935s for 90112 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 690.9132s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s + [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007) differ by less than 3E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.657733e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.659779e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 50.2187s - [COUNTERS] Fortran Overhead ( 0 ) : 23.1972s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.0215s for 8192 events => throughput is 3.03E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.9431s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 322.8676s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8768s - [COUNTERS] CudaCpp MEs ( 2 ) : 295.9908s for 90112 events => throughput is 3.04E+02 events/s + [COUNTERS] PROGRAM TOTAL : 318.2044s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.617077e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.626995e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 44.2033s - [COUNTERS] Fortran Overhead ( 0 ) : 20.0984s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.1049s for 8192 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.2064s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 287.4746s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9380s - [COUNTERS] CudaCpp MEs ( 2 ) : 263.5366s for 90112 events => throughput is 3.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 291.1048s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s + [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.163294e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.178912e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 45.5276s - [COUNTERS] Fortran Overhead ( 0 ) : 22.2470s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.2806s for 8192 events => throughput is 3.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.1102230246251565e-15) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656009E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 280.9380s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0371s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.9009s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 285.1342s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656009E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.775442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.798228e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 4.2469s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1621s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0848s for 8192 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2510s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.5543122344752192e-15) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 18.9307s - [COUNTERS] Fortran Overhead ( 0 ) : 7.0189s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9118s for 90112 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.8198s + [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.516506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.276640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.254877e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.570497e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.248946e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.448571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.255913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.238725e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.234312e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 6e7885c855..2b7ca2c190 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-06_00:00:45 +DATE: 2024-03-01_05:18:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 95.1870s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4906s - [COUNTERS] Fortran MEs ( 1 ) : 94.6964s for 8192 events => throughput is 8.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.8320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 94.9685s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.4953s - [COUNTERS] Fortran MEs ( 1 ) : 94.4732s for 8192 events => throughput is 8.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s + [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1045.4531s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3853s - [COUNTERS] Fortran MEs ( 1 ) : 1041.0679s for 90112 events => throughput is 8.66E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719498009764E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 191.0542s - [COUNTERS] Fortran Overhead ( 0 ) : 88.4952s - [COUNTERS] CudaCpp MEs ( 2 ) : 102.5590s for 8192 events => throughput is 7.99E+01 events/s + [COUNTERS] PROGRAM TOTAL : 197.7089s + [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s *** (2-none) Compare 
MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719498009764E-006) differ by less than 4E-4 (0.00013981555433351112) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326289850060011E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1218.1212s - [COUNTERS] Fortran Overhead ( 0 ) : 92.3855s - [COUNTERS] CudaCpp MEs ( 2 ) : 1125.7357s for 90112 events => throughput is 8.00E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1274.0074s + [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s + [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326289850060011E-007) differ by less than 4E-4 (0.00014135250101854346) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.234084e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.340763e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405716133562926E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.0072s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3751s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.6321s for 8192 events => throughput is 3.20E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6519s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405716133562926E-006) differ by less than 4E-4 (0.0001395443151488429) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405717007921116E-006) differ by less than 4E-4 (0.00013961480525170877) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326283773234128E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 310.5028s - [COUNTERS] Fortran Overhead ( 0 ) : 26.9907s - [COUNTERS] CudaCpp MEs ( 2 ) : 283.5121s for 90112 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 315.8806s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283773234128E-007) differ by less than 4E-4 (0.00014109195015965525) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007) differ by less than 4E-4 (0.00014114029707035236) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.609202e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.624327e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 25.1152s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8289s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.2863s for 8192 events => throughput is 6.17E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 162.6923s - [COUNTERS] Fortran Overhead ( 0 ) : 15.8974s - [COUNTERS] CudaCpp MEs ( 2 ) : 146.7949s for 90112 events => throughput is 6.14E+02 events/s + [COUNTERS] PROGRAM TOTAL : 165.7549s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s + [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.205911e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.215076e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405715853898719E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 22.6386s - [COUNTERS] Fortran Overhead ( 0 ) : 10.6852s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9534s for 8192 events => throughput is 6.85E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405715853898719E-006) differ by less than 4E-4 (0.00013952176883003098) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326275792962891E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 145.3504s - [COUNTERS] Fortran Overhead ( 0 ) : 14.3703s - [COUNTERS] CudaCpp MEs ( 2 ) : 130.9800s for 90112 events => throughput is 6.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326275792962891E-007) differ by less than 4E-4 (0.00014074978690437057) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.145261e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.089522e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719423038986E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 22.6291s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3109s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3183s for 8192 events => throughput is 7.24E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.0558s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405719423038986E-006) differ by less than 4E-4 (0.00013980951024539223) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719306052570E-006) differ by less than 4E-4 (0.00013980007888836354) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326283662420285E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 140.4388s - [COUNTERS] Fortran Overhead ( 0 ) : 15.1102s - [COUNTERS] CudaCpp MEs ( 2 ) : 125.3286s for 90112 events => throughput is 7.19E+02 events/s + [COUNTERS] PROGRAM TOTAL : 144.1559s + [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326283662420285E-007) differ by less than 4E-4 (0.00014108719888938914) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007) differ by less than 4E-4 (0.00014108709892313165) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.551293e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.502865e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405722175509506E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 2.5073s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0137s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4936s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2405722175509506E-006) differ by less than 4E-4 (0.00014003141235763295) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 11.2651s - [COUNTERS] Fortran Overhead ( 0 ) : 5.8307s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4344s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.1120s + [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.00014165768834106807) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.641774e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628707e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.317424e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.380209e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.351711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.398349e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.351253e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.434374e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.425348e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 5311267b6e..99d7cfbcd5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,8 +1,8 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 @@ -10,33 +10,33 @@ make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-02-06_01:04:55 +DATE: 2024-03-01_06:24:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 95.2652s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4947s - [COUNTERS] Fortran MEs ( 1 ) : 94.7705s for 8192 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939176E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 94.9824s - [COUNTERS] Fortran 
Overhead ( 0 ) : 0.4952s - [COUNTERS] Fortran MEs ( 1 ) : 94.4872s for 8192 events => throughput is 8.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1318s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655972E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1049.9382s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3738s - [COUNTERS] Fortran MEs ( 1 ) : 1045.5645s for 90112 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1057.5728s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s + [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985299359846E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 210.2766s - [COUNTERS] Fortran Overhead ( 0 ) : 96.9104s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.3662s for 8192 events => throughput is 7.23E+01 events/s + [COUNTERS] PROGRAM TOTAL : 220.4361s + [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s + [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s *** (2-none) Compare 
MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985299359846E-006) differ by less than 2E-4 (5.7578810608305275e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985299359844E-006) differ by less than 2E-4 (5.7578810608305275e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1355.4366s - [COUNTERS] Fortran Overhead ( 0 ) : 101.1078s - [COUNTERS] CudaCpp MEs ( 2 ) : 1254.3289s for 90112 events => throughput is 7.18E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1422.8276s + [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s + [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389403812117166e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389404034161771e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.512949e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.521314e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985295828473E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 109.5011s - [COUNTERS] Fortran Overhead ( 0 ) : 50.7884s - [COUNTERS] CudaCpp MEs ( 2 ) : 58.7127s for 8192 events => throughput is 1.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.5022s + [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985295828473E-006) differ by less than 2E-4 (5.473184350179849e-09) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985295828471E-006) differ by less than 2E-4 (5.473184350179849e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993222645648E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 699.7664s - [COUNTERS] Fortran Overhead ( 0 ) : 54.5245s - [COUNTERS] CudaCpp MEs ( 2 ) : 645.2418s for 90112 events => throughput is 1.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 715.3882s + [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s + [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222645648E-007) differ by less than 2E-4 (5.8307128014689624e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007) differ by less than 2E-4 (5.830713245558172e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.643172e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.637966e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 47.5676s - [COUNTERS] Fortran Overhead ( 0 ) : 22.0164s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5512s for 8192 events => throughput is 3.21E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.5744s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 305.3712s - [COUNTERS] Fortran Overhead ( 0 ) : 25.6316s - [COUNTERS] CudaCpp MEs ( 2 ) : 279.7396s for 90112 events => throughput is 3.22E+02 events/s + [COUNTERS] PROGRAM TOTAL : 319.2663s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.853680e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.839043e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 41.7281s - [COUNTERS] Fortran Overhead ( 0 ) : 19.1520s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.5761s for 8192 events => throughput is 3.63E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 271.0485s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0363s - [COUNTERS] CudaCpp MEs ( 2 ) : 248.0121s for 90112 events => throughput is 3.63E+02 events/s + [COUNTERS] PROGRAM TOTAL : 277.3470s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.395944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.405946e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 44.0281s - [COUNTERS] Fortran Overhead ( 0 ) : 21.6063s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.4218s for 8192 events => throughput is 3.65E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.2143s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 270.4138s - [COUNTERS] Fortran Overhead ( 0 ) : 25.3779s - [COUNTERS] CudaCpp MEs ( 2 ) : 245.0359s for 90112 events => throughput is 3.68E+02 events/s + [COUNTERS] PROGRAM TOTAL : 278.0679s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s + [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.82220427425284e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.938341e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.926685e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 3.5865s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7235s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5884s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939176E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480693924894922e-10) +OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993078576736E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 16.0732s - [COUNTERS] Fortran Overhead ( 0 ) : 6.5735s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4997s for 90112 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.9902s + [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655972E-007) and cpp (2.3322993078576736E-007) differ by less than 2E-4 (3.4640645907302314e-10) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.407818e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.084506e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160968e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = 
CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107573e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111189e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.632319e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.656493e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 3e31dbe95a..8e9ad5ba7a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu 
CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-05_22:34:08 +DATE: 2024-03-01_03:51:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4576s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3875s - [COUNTERS] Fortran MEs ( 1 ) : 0.0701s for 8192 events => throughput is 1.17E+05 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.4944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3144s - [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5165s - [COUNTERS] Fortran MEs ( 1 ) : 0.7655s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s + [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - 
[COUNTERS] PROGRAM TOTAL : 0.4725s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3960s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561287] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4597s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6104s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8493s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561287) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.071916e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351262536] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3995s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0419s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262536) differ by less than 3E-14 (2.930988785010413e-14) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539351262530) differ by less than 3E-14 (2.9531932455029164e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0138s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5775s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4364s for 90112 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9658s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561290) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.048379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.053491e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3649s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8252s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5671s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2581s for 90112 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 
1.7585s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.435366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.453652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0205s for 8192 events => throughput is 3.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s + [COUNTERS] CudaCpp MEs ( 2 ) : 
0.0225s for 8192 events => throughput is 3.65E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7846s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5597s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2249s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.968141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.963827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3772s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3473s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0299s for 8192 events => throughput is 2.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8834s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5621s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3214s for 90112 events => throughput is 2.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 
1.8893s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561295) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.713679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.652984e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263352] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7431s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7425s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 
1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263352) differ by less than 3E-14 (8.881784197001252e-16) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561298] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9618s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9543s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.21E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686556561298) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.577372e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.168278e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.398056e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.505593e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.390610e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.767356e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.780303e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 9909d81694..63166c80e0 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-05_22:34:38 +DATE: 2024-03-01_03:52:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4477s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3778s - [COUNTERS] Fortran MEs ( 1 ) : 0.0699s for 8192 events => throughput is 1.17E+05 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.4536s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3823s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3126s - [COUNTERS] Fortran MEs ( 1 ) : 0.0698s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2711s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5092s - [COUNTERS] Fortran MEs ( 1 ) : 0.7619s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s + [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110461852325612] 
fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4797s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0727s for 8192 events => throughput is 1.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110461852325612) differ by less than 4E-4 (2.8586276618058903e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110463093540638) differ by less than 4E-4 (2.812844174915341e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510685241079500] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3864s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6235s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7628s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685241079500) differ by less than 4E-4 (6.11548025553077e-08) +OK! 
xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ by less than 4E-4 (1.3172298474195543e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.215484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.198326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110456793177945] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110456793177945) differ by less than 4E-4 (3.0452395031188573e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110459152958460) differ by less than 4E-4 (2.9581965829139634e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510681375304044] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5557s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2577s for 90112 events => throughput is 3.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510681375304044) differ by less than 4E-4 (2.408689854238588e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ by less than 4E-4 (1.6458771667782202e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.485750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.433033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110458350871136] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3444s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3319s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110458350871136) differ by less than 4E-4 (2.987782395047489e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6853s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5504s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1349s for 90112 events => throughput is 6.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6208s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.565170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.672553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110458350871136] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3399s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3287s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110458350871136) differ by less than 4E-4 (2.987782395047489e-06) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510680866622453] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7807s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6487s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1319s for 90112 events => throughput is 6.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510680866622453) differ by less than 4E-4 (2.6451684009831666e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.899301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.086975e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3525s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3371s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0155s for 8192 events => throughput is 5.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510685411522340] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7160s - [COUNTERS] Fortran Overhead ( 
0 ) : 1.5505s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 90112 events => throughput is 5.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510685411522340) differ by less than 4E-4 (5.3231167029821336e-08) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ by less than 4E-4 (5.3231167917999755e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.309335e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.077737e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7434s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events 
=> throughput is 1.55E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510689885789416] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9556s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.54E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8968s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510689885789416) differ by less than 4E-4 (1.547708909921397e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.681067e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.455476e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.793086e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.723997e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 4.841062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.798906e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.394007e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.992718e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index d0f21b96dd..eb4ca92d13 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none - -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-02-05_22:35:08 +DATE: 2024-03-01_03:52:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4458s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3754s - [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3835s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3139s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < 
/tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561290] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2916s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5236s - [COUNTERS] Fortran MEs ( 1 ) : 0.7681s for 90112 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2499s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s + [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539348915991] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4710s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3945s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348915991) differ by less than 2E-4 (8.658396222216425e-11) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4876s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6294s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794334) differ by less than 2E-4 (1.967879192932287e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ by less than 2E-4 (1.967879192932287e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.023469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065490e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3962s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3571s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0102s - [COUNTERS] Fortran Overhead ( 
0 ) : 1.5782s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4320s for 90112 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9359s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686560794337) differ by less than 2E-4 (1.9678814133783362e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ by less than 2E-4 (1.9678769724862377e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.046626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.000567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3734s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3500s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 
events => throughput is 3.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5602s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2526s for 90112 events => throughput is 3.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.546873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.489419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3400s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 8192 events => throughput is 4.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3573s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7728s - [COUNTERS] Fortran Overhead ( 
0 ) : 1.5560s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2168s for 90112 events => throughput is 4.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7304s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.091661e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.136379e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3802s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3498s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0304s for 8192 events 
=> throughput is 2.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551750] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9114s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5781s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3333s for 90112 events => throughput is 2.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9542s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686558551750) differ by less than 2E-4 (9.2533536388828e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.656196e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.642304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539343558532] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7437s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7473s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558532) differ by less than 2E-4 (2.8419933073564607e-10) +OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9687s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8944s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561290) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620649053081024e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.582708e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.992203e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392597e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.507161e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.373790e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.765106e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.772367e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 TEST COMPLETED From ca46e82882326ca755b4d0fa8d133c40d10398b6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 12:38:12 +0200 Subject: [PATCH 85/96] [susy2] bug fix in tput/teeThroughputX.sh: rename -rorhst as -hirhst and comment it out (this is not supported in tput/throughputX.sh and in cudacpp anyway) --- epochX/cudacpp/tput/teeThroughputX.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index de0a1e912a..5f7ab08180 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -93,8 +93,8 @@ 
for arg in $*; do rndgen=$arg elif [ "$arg" == "-curhst" ]; then rndgen=$arg - elif [ "$arg" == "-rorhst" ]; then - rndgen=$arg + ###elif [ "$arg" == "-hirhst" ]; then + ### rndgen=$arg elif [ "$arg" == "-rmbhst" ]; then rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then From a14315948898429ffd7736ea0f801a21691c812f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 12:40:52 +0200 Subject: [PATCH 86/96] [susy2] bug fix in tput/throughputX.sh: comment out -hirhst (which is not mentioned in usage, and is not supported in cudacpp code anyway) --- epochX/cudacpp/tput/throughputX.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index efb282fc58..b2fff3a632 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -187,9 +187,9 @@ while [ "$1" != "" ]; do elif [ "$1" == "-curhst" ]; then rndgen=" -${1}" shift - elif [ "$1" == "-hirhst" ]; then - rndgen=" -${1}" - shift + ###elif [ "$1" == "-hirhst" ]; then + ### rndgen=" -${1}" + ### shift elif [ "$1" == "-rmbhst" ]; then rmbsmp=" -${1}" shift From 01096bbe4e549f18d352d6ed312e8f6202c6b8ac Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 16:26:25 +0200 Subject: [PATCH 87/96] [susy2] rerun 72 tput tests on LUMI - all ok (with known errors on gqttq) (1) Build tests on login node (~22h) STARTED AT Fri 01 Mar 2024 12:11:58 AM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean -makeonly ENDED(1) AT Fri 01 Mar 2024 09:30:18 PM EET [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean -makeonly ENDED(2) AT Fri 01 Mar 2024 09:59:02 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean -makeonly ENDED(3) AT Fri 01 Mar 2024 10:11:51 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst 
-makeonly ENDED(4) AT Fri 01 Mar 2024 10:14:10 PM EET [Status=0] SKIP './tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common -makeonly' ENDED(5) AT Fri 01 Mar 2024 10:14:10 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common -makeonly ENDED(6) AT Fri 01 Mar 2024 10:16:26 PM EET [Status=0] (2) Step 2 - run tests on worker nodes (~1h30) ./tput/allTees.sh -hip STARTED AT Sun 03 Mar 2024 01:29:52 PM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Sun 03 Mar 2024 02:22:12 PM EET [Status=2] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Sun 03 Mar 2024 02:39:13 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Sun 03 Mar 2024 02:58:20 PM EET [Status=2] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Sun 03 Mar 2024 03:02:06 PM EET [Status=0] SKIP './tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ' ENDED(5) AT Sun 03 Mar 2024 03:02:06 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ENDED(6) AT Sun 03 Mar 2024 03:05:50 PM EET [Status=0] ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:ERROR! 
Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:ERROR! Fortran calculation (F77/CUDA) crashed --- .../log_eemumu_mad_d_inl0_hrd0.txt | 238 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 246 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 221 ++++++-------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 241 ++++++--------- .../log_eemumu_mad_d_inl0_hrd1.txt | 234 ++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 238 ++++++--------- .../log_eemumu_mad_d_inl1_hrd1.txt | 238 ++++++--------- .../log_eemumu_mad_f_inl0_hrd0.txt | 248 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 254 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 231 ++++++-------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 249 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 248 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 248 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 248 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 234 ++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt | 234 ++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 238 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 246 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 221 ++++++-------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 241 ++++++--------- .../log_ggtt_mad_d_inl0_hrd1.txt | 234 
++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 238 ++++++--------- .../log_ggtt_mad_d_inl1_hrd1.txt | 234 ++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 252 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 260 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 241 ++++++--------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 255 +++++++--------- .../log_ggtt_mad_f_inl0_hrd1.txt | 252 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 252 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 252 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 238 ++++++--------- .../log_ggtt_mad_m_inl0_hrd1.txt | 238 ++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 263 +++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 273 +++++++---------- .../log_ggttg_mad_d_inl0_hrd1.txt | 263 +++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 277 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 287 ++++++++--------- .../log_ggttg_mad_f_inl0_hrd1.txt | 277 ++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 259 +++++++--------- .../log_ggttg_mad_m_inl0_hrd1.txt | 259 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 263 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 273 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 244 ++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 268 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 263 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 271 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 271 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 279 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 289 ++++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 270 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 284 ++++++++--------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 277 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 275 +++++++---------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 275 +++++++---------- 
.../log_ggttgg_mad_m_inl0_hrd0.txt | 259 +++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 259 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 259 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 269 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 259 +++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 275 +++++++---------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 285 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 275 +++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 259 +++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 259 +++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 258 +++++----------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 270 +++++----------- .../log_gqttq_mad_d_inl0_hrd1.txt | 258 +++++----------- .../log_gqttq_mad_f_inl0_hrd0.txt | 258 +++++----------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 270 +++++----------- .../log_gqttq_mad_f_inl0_hrd1.txt | 258 +++++----------- .../log_gqttq_mad_m_inl0_hrd0.txt | 258 +++++----------- .../log_gqttq_mad_m_inl0_hrd1.txt | 258 +++++----------- 72 files changed, 7422 insertions(+), 11026 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index baa8c044cd..ad41cc6bfb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand 
+HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:23:52 +DATE: 2024-03-03_14:01:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.907657 sec - 2,864,594,511 cycles # 3.017 GHz - 4,419,491,827 instructions # 1.54 insn per cycle - 1.243823060 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.324596e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.112739e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.342398e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.859560 sec + 1,321,428,084 cycles:u # 1.235 GHz (74.81%) + 2,150,562 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.38%) + 5,995,950 stalled-cycles-backend:u # 0.45% backend cycles idle (74.89%) + 2,074,374,266 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (75.26%) + 1.422954208 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.029383 sec - 18,345,746,310 cycles # 3.041 GHz - 43,971,705,846 instructions # 2.40 insn per cycle - 6.038464488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.252614e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.432573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.432573e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.754991 sec + 19,449,019,198 cycles:u # 3.361 GHz (74.99%) + 51,261,991 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.99%) + 55,689,936 stalled-cycles-backend:u # 0.29% backend cycles idle (74.91%) + 47,091,295,354 instructions:u # 2.42 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 5.789720623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.154865 sec - 12,823,382,487 cycles # 3.082 GHz - 30,998,172,347 instructions # 2.42 insn per cycle - 4.171623433 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.927684e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425335e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425335e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.992105 sec + 13,278,223,442 cycles:u # 3.301 GHz (74.95%) + 52,332,939 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.96%) + 1,017,469,682 stalled-cycles-backend:u # 7.66% backend cycles idle (74.96%) + 31,194,174,433 instructions:u # 2.35 insn per cycle + # 0.03 stalled cycles per insn (74.96%) + 4.028308592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.406763 sec - 10,081,289,557 cycles # 2.955 GHz - 19,366,111,959 instructions # 1.92 insn per cycle - 3.427414790 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.667453e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548951e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548951e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.088586 sec + 10,124,819,179 cycles:u # 3.246 GHz (74.93%) + 47,833,768 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.89%) + 414,834,375 stalled-cycles-backend:u # 4.10% backend cycles idle (74.87%) + 19,410,775,333 instructions:u # 1.92 insn per cycle + # 0.02 stalled cycles per insn (74.97%) + 3.123576071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.257696 sec - 9,685,682,355 cycles # 2.968 GHz - 18,976,171,527 instructions # 1.96 insn per cycle - 3.273948471 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.888242 sec - 8,621,851,062 cycles # 2.214 GHz - 15,727,334,662 instructions # 1.82 insn per cycle - 3.905958468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index b9ff72dbf3..02c3c2eb21 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,223 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:12:58 +DATE: 2024-03-03_14:51:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.232505 sec - 7,524,955,995 cycles # 3.041 GHz - 13,468,669,108 instructions # 1.79 insn per cycle - 2.532807464 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.474799e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.308389e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.308389e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.547677 sec + 18,355,875,731 cycles:u # 3.290 GHz (74.87%) + 120,621,827 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.92%) + 6,921,495,641 stalled-cycles-backend:u # 37.71% backend cycles idle (75.07%) + 17,150,236,773 instructions:u # 0.93 insn per cycle + # 0.40 stalled cycles per insn (75.06%) + 5.611745225 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.415532 sec - 19,561,606,037 cycles # 3.046 GHz - 44,198,639,919 instructions # 2.26 insn per cycle - 6.422457347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.236454e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.411058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411058e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.925739 sec + 19,889,727,529 cycles:u # 3.334 GHz (74.92%) + 51,462,014 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.92%) + 116,028,144 stalled-cycles-backend:u # 0.58% backend cycles idle (74.99%) + 47,213,004,904 instructions:u # 2.37 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 5.968530389 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.664054 sec - 13,997,557,946 cycles # 2.998 GHz - 31,841,279,233 instructions # 2.27 insn per cycle - 4.670791737 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.876382e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345724e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.222665 sec + 13,876,988,167 cycles:u # 3.256 GHz (75.04%) + 53,634,197 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.04%) + 1,081,837,065 stalled-cycles-backend:u # 7.80% backend cycles idle (74.88%) + 32,066,862,891 instructions:u # 2.31 insn per cycle + # 0.03 stalled cycles per insn (74.88%) + 4.266640956 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.823801 sec - 11,324,833,068 cycles # 2.957 GHz - 20,724,775,427 instructions # 1.83 insn per cycle - 3.830534322 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.550661e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348496e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348496e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.349151 sec + 10,813,502,985 cycles:u # 3.190 GHz (75.01%) + 50,474,326 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.00%) + 406,381,998 stalled-cycles-backend:u # 3.76% backend cycles idle (75.00%) + 20,750,963,840 instructions:u # 1.92 insn per cycle + # 0.02 stalled cycles per insn (74.98%) + 3.393666389 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.704930 sec - 10,963,593,820 cycles # 2.954 GHz - 20,347,072,159 instructions # 1.86 insn per cycle - 3.711957869 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.214412 sec - 9,956,996,891 cycles # 2.360 GHz - 16,873,658,319 instructions # 1.69 insn per cycle - 4.221168968 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 09aaad1dd8..0b5bb72c22 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,210 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) 
make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:26:09 +DATE: 2024-03-03_15:03:14 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.256929e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.117141e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.346881e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.329039 sec - 4,626,136,964 cycles # 2.966 GHz - 7,229,705,832 instructions # 1.56 insn per cycle - 1.616136536 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 
--common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 4.665450 sec + 15,369,141,348 cycles:u # 3.275 GHz (75.00%) + 53,792,556 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.98%) + 6,901,191,727 stalled-cycles-backend:u # 44.90% backend cycles idle (74.94%) + 11,509,551,541 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (74.93%) + 4.717205353 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.250207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429729e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429729e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.368910 sec - 19,436,039,687 cycles # 3.050 GHz - 44,075,637,403 instructions # 2.27 insn per cycle - 6.374367735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.768692 sec + 19,540,266,544 cycles:u # 3.370 GHz (74.89%) + 50,423,911 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.96%) + 72,392,327 stalled-cycles-backend:u # 0.37% backend cycles idle (75.03%) + 47,003,028,682 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 5.801685990 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.928009e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.423643e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.423643e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.477126 sec - 13,840,650,655 cycles # 3.088 GHz - 31,000,398,658 instructions # 2.24 insn per cycle - 4.482579907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.989684 sec + 13,290,935,841 cycles:u # 3.306 GHz (74.93%) + 52,308,037 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.94%) + 1,092,395,998 stalled-cycles-backend:u # 8.22% backend cycles idle (74.94%) + 31,201,564,853 instructions:u # 2.35 insn per cycle + # 0.04 stalled cycles per insn (74.93%) + 4.022478905 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.658913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.545624e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.545624e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.779571 sec - 11,221,356,305 cycles # 2.967 GHz - 19,268,573,834 instructions # 1.72 insn per cycle - 3.784933241 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.096330 sec + 10,149,407,506 cycles:u # 3.246 GHz (74.93%) + 49,215,562 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.82%) + 419,084,250 stalled-cycles-backend:u # 4.13% backend cycles idle (74.82%) + 19,401,968,357 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (74.96%) + 3.129271544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.643336 sec - 10,818,026,445 cycles # 2.966 GHz - 18,676,470,141 instructions # 1.73 insn per cycle - 3.648853496 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.111357 sec - 9,725,602,646 cycles # 2.364 GHz - 15,429,502,829 instructions # 1.59 insn per cycle - 4.116843302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 
tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 4a4acadae4..5417009137 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,212 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:19:38 +DATE: 2024-03-03_14:59:28 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.841184 sec - 6,281,268,865 cycles # 3.032 GHz - 11,616,541,551 instructions # 1.85 insn per cycle - 2.127335919 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.514751e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.084739e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.314117e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.378218 sec + 17,859,605,335 cycles:u # 3.302 GHz (75.01%) + 119,964,353 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.01%) + 6,837,606,477 stalled-cycles-backend:u # 38.29% backend cycles idle (75.06%) + 16,740,556,145 instructions:u # 0.94 insn per cycle + # 0.41 stalled cycles per insn (75.04%) + 5.429728919 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.931254 sec - 18,320,874,631 cycles # 3.087 GHz - 43,971,483,251 instructions # 2.40 insn per cycle - 5.936943481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.251851e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.431006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.431006e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.756539 sec + 19,485,062,223 cycles:u # 3.368 GHz (74.98%) + 51,536,175 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.98%) + 59,518,097 stalled-cycles-backend:u # 0.31% backend cycles idle (74.99%) + 47,089,103,622 instructions:u # 2.42 insn per cycle + # 0.00 stalled cycles per insn (74.92%) + 5.789154373 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.142725 sec - 12,747,370,194 cycles # 3.074 GHz - 30,997,666,885 instructions # 2.43 insn per cycle - 4.148307465 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.934074e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436656e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.982711 sec + 13,292,890,354 cycles:u # 3.312 GHz (74.90%) + 52,677,465 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.88%) + 1,040,209,336 stalled-cycles-backend:u # 7.83% backend cycles idle (74.94%) + 31,137,052,240 instructions:u # 2.34 insn per cycle + # 0.03 stalled cycles per insn (75.04%) + 4.015553191 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411600 sec - 10,085,079,136 cycles # 2.953 GHz - 19,364,558,625 instructions # 1.92 insn per cycle - 3.417084709 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.669189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.552335e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.552335e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.086401 sec + 10,130,156,702 cycles:u # 3.250 GHz (74.87%) + 47,813,678 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.85%) + 431,077,569 stalled-cycles-backend:u # 4.26% backend cycles idle (74.95%) + 19,360,973,007 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (75.08%) + 3.118993026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.338836 sec - 9,731,023,917 cycles # 2.911 GHz - 18,988,816,377 instructions # 1.95 insn per cycle - 3.344328310 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.766791 sec - 8,586,243,314 cycles # 2.277 GHz - 15,726,194,960 instructions # 1.83 insn per cycle - 3.772300478 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index acaec4a100..79e3941e0f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:24:28 +DATE: 2024-03-03_14:02:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699180 sec - 2,815,032,547 cycles # 3.020 GHz - 4,411,732,319 instructions # 1.57 insn per cycle - 1.012826906 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.920060e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.601512e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.923626e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.491128 sec + 1,290,175,959 cycles:u # 2.505 GHz (75.02%) + 2,375,328 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.49%) + 5,596,436 stalled-cycles-backend:u # 0.43% backend cycles idle (74.55%) + 2,025,639,080 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (74.55%) + 0.545806838 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.744811 sec - 17,454,360,700 cycles # 3.039 GHz - 41,822,159,126 instructions # 2.40 insn per cycle - 5.754685240 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.324266e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.526356e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.526356e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.479692 sec + 18,527,886,497 cycles:u # 3.363 GHz (74.94%) + 51,303,501 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.01%) + 71,667,286 stalled-cycles-backend:u # 0.39% backend cycles idle (75.03%) + 44,741,161,010 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 5.513054212 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.046627 sec - 12,493,235,601 cycles # 3.083 GHz - 30,160,547,265 instructions # 2.41 insn per cycle - 4.067076512 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.023611e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577643e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577643e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.835861 sec + 12,754,913,480 cycles:u # 3.299 GHz (74.99%) + 52,420,621 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.97%) + 73,335,748 stalled-cycles-backend:u # 0.57% backend cycles idle (74.97%) + 30,104,394,151 instructions:u # 2.36 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 3.870675775 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.357760 sec - 9,927,136,910 cycles # 2.952 GHz - 19,096,793,241 instructions # 1.92 insn per cycle - 3.375474470 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.602054e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.435734e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.435734e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.149196 sec + 10,399,600,756 cycles:u # 3.270 GHz (74.90%) + 52,047,342 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.84%) + 302,365,095 stalled-cycles-backend:u # 2.91% backend cycles idle (74.92%) + 18,912,332,875 instructions:u # 1.82 insn per cycle + # 0.02 stalled cycles per insn (75.05%) + 3.184201759 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.243150 sec - 9,616,213,299 cycles # 2.960 GHz - 18,757,748,925 instructions # 1.95 insn per cycle - 3.265371118 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.680994 sec - 8,464,459,891 cycles # 2.296 GHz - 15,603,182,673 instructions # 1.84 insn per cycle - 3.700542167 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 5e36a6ad1c..d45f8d9d60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:02:07 +DATE: 2024-03-03_14:32:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677531 sec - 2,738,360,567 cycles # 3.010 GHz - 4,202,554,319 instructions # 1.53 insn per cycle - 0.971727419 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.257790e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.106060e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.334611e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.510314 sec + 1,324,021,627 cycles:u # 2.494 GHz (74.45%) + 2,359,804 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.14%) + 5,360,054 stalled-cycles-backend:u # 0.40% backend cycles idle (74.44%) + 2,075,244,536 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (75.14%) + 0.565639760 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.107132 sec - 12,669,493,888 cycles # 3.081 GHz - 32,513,570,576 instructions # 2.57 insn per cycle - 4.112837024 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.786118e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174032e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.174032e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.252899 sec + 14,197,263,778 cycles:u # 3.313 GHz (74.99%) + 51,197,569 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) + 401,360,207 stalled-cycles-backend:u # 2.83% backend cycles idle (75.00%) + 36,769,260,973 instructions:u # 2.59 insn per cycle + # 0.01 stalled cycles per insn (75.00%) + 4.288026881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.385880 sec - 10,259,128,837 cycles # 3.025 GHz - 24,473,597,991 instructions # 2.39 insn per cycle - 3.391687112 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.413369e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.248265e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248265e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.336198 sec + 11,000,487,394 cycles:u # 3.267 GHz (74.87%) + 51,674,634 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) + 251,927,164 stalled-cycles-backend:u # 2.29% backend cycles idle (75.05%) + 24,665,290,150 instructions:u # 2.24 insn per cycle + # 0.01 stalled cycles per insn (75.06%) + 3.371408592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.179158 sec - 9,139,183,085 cycles # 2.870 GHz - 16,922,980,195 instructions # 1.85 insn per cycle - 3.185130704 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.010543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186949e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186949e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.820748 sec + 9,177,598,955 cycles:u # 3.218 GHz (74.96%) + 48,256,657 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.04%) + 140,263,482 stalled-cycles-backend:u # 1.53% backend cycles idle (75.04%) + 16,797,741,901 instructions:u # 1.83 insn per cycle + # 0.01 stalled cycles per insn (75.04%) + 2.856117399 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.299126 sec - 9,225,486,663 cycles # 2.804 GHz - 16,350,529,622 instructions # 1.77 insn per cycle - 3.305119215 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.449960 sec - 7,914,148,444 cycles # 2.292 GHz - 14,582,993,732 instructions # 1.84 insn per cycle - 3.455623027 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 640cde8efe..2554b2b401 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:02:37 +DATE: 2024-03-03_14:32:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676373 sec - 2,668,503,996 cycles # 2.929 GHz - 4,153,523,497 instructions # 1.56 insn per cycle - 0.971892133 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.884470e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.600483e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.922931e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.490936 sec + 1,262,401,840 cycles:u # 2.452 GHz (74.12%) + 2,354,272 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.27%) + 5,127,550 stalled-cycles-backend:u # 0.41% backend cycles idle (75.30%) + 2,046,536,140 instructions:u # 1.62 insn per cycle + # 0.00 stalled cycles per insn (75.85%) + 0.545640180 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.188433 sec - 9,833,021,244 cycles # 3.080 GHz - 25,393,539,961 instructions # 2.58 insn per cycle - 3.194101979 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.448227e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.241157e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.241157e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.300684 sec + 10,864,613,767 cycles:u # 3.260 GHz (74.93%) + 50,513,894 stalled-cycles-frontend:u # 0.46% frontend cycles idle (75.04%) + 58,038,331 stalled-cycles-backend:u # 0.53% backend cycles idle (75.04%) + 28,417,485,541 instructions:u # 2.62 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 3.335392064 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.899703 sec - 8,920,893,128 cycles # 3.072 GHz - 21,482,466,118 instructions # 2.41 insn per cycle - 2.905533602 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.640077e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.678024e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.678024e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.110817 sec + 10,212,854,193 cycles:u # 3.251 GHz (74.84%) + 49,334,287 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.91%) + 54,421,651 stalled-cycles-backend:u # 0.53% backend cycles idle (75.04%) + 21,633,351,327 instructions:u # 2.12 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 3.146041531 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.882396 sec - 8,595,793,495 cycles # 2.978 GHz - 15,810,706,009 instructions # 1.84 insn per cycle - 2.888136564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.296083e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.760867e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.760867e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.640234 sec + 8,552,163,954 cycles:u # 3.202 GHz (74.92%) + 48,388,466 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.87%) + 139,356,364 stalled-cycles-backend:u # 1.63% backend cycles idle (74.85%) + 15,849,743,897 instructions:u # 1.85 insn per cycle + # 0.01 stalled cycles per insn (74.94%) + 2.675594881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165084E-002 +Relative difference = 1.0277089582483854e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.905551 sec - 8,435,887,633 cycles # 2.898 GHz - 15,503,428,881 instructions # 1.84 insn per cycle - 2.911395780 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.208349 sec - 7,562,205,797 cycles # 2.353 GHz - 14,282,233,625 instructions # 1.89 insn per cycle - 3.214128577 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 4388b968c1..192f203417 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:25:01 +DATE: 2024-03-03_14:02:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.080730e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.584592 sec - 2,424,873,450 cycles # 2.992 GHz - 3,757,113,510 instructions # 1.55 insn per cycle - 0.891497126 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.894573e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.208938e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.968247e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.384630 sec + 968,263,305 cycles:u # 2.395 GHz (74.33%) + 2,353,929 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.20%) + 4,170,768 stalled-cycles-backend:u # 0.43% backend cycles idle (73.94%) + 1,766,876,262 instructions:u # 1.82 insn per cycle + # 0.00 stalled cycles per insn (76.21%) + 0.433113828 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.861200 sec - 17,835,681,737 cycles # 3.040 GHz - 43,512,863,183 instructions # 2.44 insn per cycle - 5.870178360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.421004e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.651144e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.651144e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.098926 sec + 17,283,392,500 cycles:u # 3.371 GHz (75.02%) + 39,987,964 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.04%) + 29,814,946 stalled-cycles-backend:u # 0.17% backend cycles idle (75.04%) + 47,182,351,433 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.129339329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.010180 sec - 9,264,818,102 cycles # 3.072 GHz - 21,907,230,972 instructions # 2.36 insn per cycle - 3.030108679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.949337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.195301e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.195301e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.811402 sec + 9,236,170,641 cycles:u # 3.254 GHz (74.93%) + 40,377,405 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.92%) + 911,350,097 stalled-cycles-backend:u # 9.87% backend cycles idle (74.94%) + 22,187,807,240 instructions:u # 2.40 insn per cycle + # 0.04 stalled cycles per insn (74.94%) + 2.842564880 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.786671 sec - 8,293,439,755 cycles # 2.970 GHz - 15,591,050,714 instructions # 1.88 insn per cycle - 2.803351674 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.416658e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.002062e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.002062e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.515569 sec + 8,226,874,549 cycles:u # 3.236 GHz (74.74%) + 42,467,746 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.71%) + 1,634,949,869 stalled-cycles-backend:u # 19.87% backend cycles idle (75.00%) + 15,532,344,605 instructions:u # 1.89 insn per cycle + # 0.11 stalled cycles per insn (75.14%) + 2.546576367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.857922 sec - 8,240,284,445 cycles # 2.878 GHz - 15,434,807,288 instructions # 1.87 insn per cycle - 2.873134335 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.738177 sec - 6,634,758,903 cycles # 2.418 GHz - 12,863,535,626 instructions # 1.94 insn per cycle - 2.752418443 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 5ebf98d844..13d81de7e2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,223 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:13:35 +DATE: 2024-03-03_14:52:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.664885 sec - 5,743,008,286 cycles # 3.032 GHz - 10,353,112,228 instructions # 1.80 insn per cycle - 1.950710268 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.603281e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306866e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.306866e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.394868 sec + 17,747,723,699 cycles:u # 3.296 GHz (75.06%) + 116,293,455 stalled-cycles-frontend:u # 0.66% frontend cycles idle (75.05%) + 6,964,156,661 stalled-cycles-backend:u # 39.24% backend cycles idle (75.08%) + 17,085,845,774 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (75.02%) + 5.448649910 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.094512 sec - 18,492,834,117 cycles # 3.035 GHz - 43,665,828,462 instructions # 2.36 insn per cycle - 6.100764200 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.407290e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.631935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631935e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.209030 sec + 17,512,513,676 cycles:u # 3.342 GHz (74.97%) + 39,347,428 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.98%) + 73,710,188 stalled-cycles-backend:u # 0.42% backend cycles idle (74.98%) + 47,446,515,991 instructions:u # 2.71 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 5.243117391 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.242674 sec - 9,984,073,322 cycles # 3.074 GHz - 23,241,211,318 instructions # 2.33 insn per cycle - 3.248988906 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.843798e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.993414e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.993414e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.973264 sec + 9,687,625,736 cycles:u # 3.224 GHz (74.98%) + 41,442,139 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.98%) + 958,118,871 stalled-cycles-backend:u # 9.89% backend cycles idle (74.87%) + 23,576,025,926 instructions:u # 2.43 insn per cycle + # 0.04 stalled cycles per insn (74.87%) + 3.009926908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.031931 sec - 9,018,287,343 cycles # 2.969 GHz - 16,710,480,351 instructions # 1.85 insn per cycle - 3.038355322 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.317531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.795026e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.795026e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.649491 sec + 8,549,300,727 cycles:u # 3.188 GHz (74.94%) + 42,286,661 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.96%) + 1,634,655,298 stalled-cycles-backend:u # 19.12% backend cycles idle (74.96%) + 16,685,077,328 instructions:u # 1.95 insn per cycle + # 0.10 stalled cycles per insn (74.94%) + 2.685772670 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.003313 sec - 8,924,279,581 cycles # 2.966 GHz - 16,553,851,203 instructions # 1.85 insn per cycle - 3.009721457 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.047824 sec - 7,411,564,908 cycles # 2.428 GHz - 14,070,800,087 instructions # 1.90 insn per cycle - 3.054259465 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 57f3a9eb6a..6f27605efb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,210 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) 
make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:26:45 +DATE: 2024-03-03_15:03:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.176348 sec - 4,160,459,328 cycles # 2.977 GHz - 6,608,736,714 instructions # 1.59 insn per cycle - 1.454481545 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.826583e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.171348e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.927510e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.542977 sec + 15,015,738,206 cycles:u # 3.288 GHz (75.04%) + 54,154,072 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.83%) + 6,892,515,673 stalled-cycles-backend:u # 45.90% backend cycles idle (74.86%) + 11,015,292,588 instructions:u # 0.73 insn per cycle + # 0.63 stalled cycles per insn (75.12%) + 4.592827207 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.412776e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.637786e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.637786e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.084905 sec - 18,848,150,042 cycles # 3.095 GHz - 43,694,410,467 instructions # 2.32 insn per cycle - 6.090122961 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.125912 sec + 17,371,936,407 cycles:u # 3.372 GHz (74.98%) + 39,561,209 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) + 71,336,012 stalled-cycles-backend:u # 0.41% backend cycles idle (75.00%) + 47,201,038,679 instructions:u # 2.72 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 5.154937938 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.929734e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.146974e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146974e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.340145 sec - 10,237,006,523 cycles # 3.061 GHz - 21,987,992,116 instructions # 2.15 insn per cycle - 3.345494687 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.822597 sec + 9,288,450,283 cycles:u # 3.260 GHz (74.99%) + 40,549,445 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.01%) + 929,075,556 stalled-cycles-backend:u # 10.00% backend cycles idle (75.01%) + 22,142,860,809 instructions:u # 2.38 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 2.851562031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.414461e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996487e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.130033 sec - 9,276,164,079 cycles # 2.959 GHz - 15,501,530,354 instructions # 1.67 insn per cycle - 3.135291294 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.516320 sec + 8,209,636,772 cycles:u # 3.229 GHz (74.86%) + 42,016,189 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.86%) + 1,641,180,581 stalled-cycles-backend:u # 19.99% backend cycles idle (74.92%) + 15,546,166,497 instructions:u # 1.89 insn per cycle + # 0.11 stalled cycles per insn (75.08%) + 2.545171918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.090209 sec - 9,218,829,691 cycles # 2.980 GHz - 15,143,949,757 instructions # 1.64 insn per cycle - 3.095551418 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.081111 sec - 7,633,670,846 cycles # 2.474 GHz - 12,572,894,419 instructions # 1.65 insn per cycle - 3.086406325 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 8d8716bc9a..75ba62d6e0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,212 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:20:14 +DATE: 2024-03-03_14:59:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.452761 sec - 5,067,036,613 cycles # 3.030 GHz - 9,262,361,364 instructions # 1.83 insn per cycle - 1.731002061 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.352458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.000530e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.714853e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.254995 sec + 17,501,585,656 cycles:u # 3.313 GHz (75.02%) + 117,376,845 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.03%) + 6,895,746,974 stalled-cycles-backend:u # 39.40% backend cycles idle (75.03%) + 16,722,875,951 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (75.03%) + 5.301503353 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.780149 sec - 17,815,433,670 cycles # 3.080 GHz - 43,511,102,764 instructions # 2.44 insn per cycle - 5.785180938 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.422792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.651217e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.651217e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.093515 sec + 17,258,116,112 cycles:u # 3.371 GHz (75.00%) + 39,040,967 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) + 35,911,867 stalled-cycles-backend:u # 0.21% backend cycles idle (75.00%) + 47,203,584,739 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 5.122349856 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.992624 sec - 9,227,327,267 cycles # 3.079 GHz - 21,906,426,544 instructions # 2.37 insn per cycle - 2.997895192 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.953939e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.195964e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.195964e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.805057 sec + 9,229,935,831 cycles:u # 3.260 GHz (74.87%) + 41,364,885 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.88%) + 932,793,246 stalled-cycles-backend:u # 10.11% backend cycles idle (74.90%) + 22,162,738,336 instructions:u # 2.40 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 2.833936052 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.845512 sec - 8,254,984,848 cycles # 2.896 GHz - 15,590,498,904 instructions # 1.89 insn per cycle - 2.850900280 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.413730e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.999080e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.999080e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.515895 sec + 8,211,824,655 cycles:u # 3.230 GHz (74.85%) + 42,295,685 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.85%) + 1,609,403,252 stalled-cycles-backend:u # 19.60% backend cycles idle (74.90%) + 15,547,822,400 instructions:u # 1.89 insn per cycle + # 0.10 stalled cycles per insn (75.07%) + 2.544692512 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.764714 sec - 8,215,374,590 cycles # 2.969 GHz - 15,429,066,515 instructions # 1.88 insn per cycle - 2.770036927 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.731162 sec - 6,615,238,340 cycles # 2.419 GHz - 12,862,797,254 instructions # 1.94 insn per cycle - 2.736410000 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index f9e4000e6d..7fb4b0ecf3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:25:31 +DATE: 2024-03-03_14:03:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095054e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581297 sec - 2,416,875,461 cycles # 3.000 GHz - 3,802,904,431 instructions # 1.57 insn per cycle - 0.886522859 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.881152e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.242181e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.006805e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.384701 sec + 975,692,681 cycles:u # 2.413 GHz (74.26%) + 2,357,292 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.34%) + 4,115,956 stalled-cycles-backend:u # 0.42% backend cycles idle (74.06%) + 1,796,776,398 instructions:u # 1.84 insn per cycle + # 0.00 stalled cycles per insn (76.05%) + 0.433515678 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.444566 sec - 16,726,225,777 cycles # 3.070 GHz - 41,270,625,621 instructions # 2.47 insn per cycle - 5.454849598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.546587e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.820453e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.820453e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.740705 sec + 16,005,689,600 cycles:u # 3.357 GHz (75.00%) + 39,140,113 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.00%) + 24,772,897 stalled-cycles-backend:u # 0.15% backend cycles idle (75.00%) + 44,042,185,394 instructions:u # 2.75 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 4.770282414 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.914617 sec - 8,996,783,237 cycles # 3.081 GHz - 21,210,998,059 instructions # 2.36 insn per cycle - 2.929493898 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.018225e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.336340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.336340e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.759305 sec + 9,080,357,138 cycles:u # 3.259 GHz (74.94%) + 42,050,430 stalled-cycles-frontend:u # 0.46% frontend cycles idle (75.02%) + 544,746,557 stalled-cycles-backend:u # 6.00% backend cycles idle (75.02%) + 21,614,717,358 instructions:u # 2.38 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 2.790124475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.760181 sec - 8,249,336,928 cycles # 2.983 GHz - 15,425,238,678 instructions # 1.87 insn per cycle - 2.778856529 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.470116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.115335e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.115335e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.485521 sec + 8,111,476,454 cycles:u # 3.229 GHz (74.89%) + 42,631,186 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.87%) + 1,607,734,533 stalled-cycles-backend:u # 19.82% backend cycles idle (74.87%) + 15,371,769,451 instructions:u # 1.90 insn per cycle + # 0.10 stalled cycles per insn (74.95%) + 2.516352054 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.789811 sec - 8,096,556,575 cycles # 2.897 GHz - 15,238,891,903 instructions # 1.88 insn per cycle - 2.804859872 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.735992 sec - 6,623,617,660 cycles # 2.417 GHz - 12,843,079,376 instructions # 1.94 insn per cycle - 2.752411310 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052564145764E-002 -Relative difference = 1.9988585667912256e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index fde060de72..e01df148fd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:03:05 +DATE: 2024-03-03_14:33:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181869e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.576138 sec - 2,415,755,755 cycles # 3.001 GHz - 3,734,378,655 instructions # 1.55 insn per cycle - 0.864225849 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.886573e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.209532e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.966862e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.389381 sec + 979,434,085 cycles:u # 2.383 GHz (75.04%) + 2,277,306 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.85%) + 4,390,330 stalled-cycles-backend:u # 0.45% backend cycles idle (73.83%) + 1,816,731,527 instructions:u # 1.85 insn per cycle + # 0.00 stalled cycles per insn (74.28%) + 0.437890631 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.002640 sec - 12,159,409,273 cycles # 3.035 GHz - 32,432,694,101 instructions # 2.67 insn per cycle - 4.008158303 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.929116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.376620e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.376620e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.933142 sec + 13,173,596,053 cycles:u # 3.327 GHz (74.95%) + 38,281,114 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.95%) + 921,861,865 stalled-cycles-backend:u # 7.00% backend cycles idle (74.96%) + 38,070,756,048 instructions:u # 2.89 insn per cycle + # 0.02 stalled cycles per insn (74.97%) + 3.962633268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039840314887E-002 -Relative difference = 1.244813035273009e-08 +Avg ME (F77/C++) = 1.2828039543819614E-002 +Relative difference = 3.5561191488957804e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.601867 sec - 7,999,882,010 cycles # 3.069 GHz - 18,656,600,340 instructions # 2.33 insn per cycle - 2.607493343 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.477958e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.358625e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.358625e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.484051 sec + 8,104,443,603 cycles:u # 3.227 GHz (74.86%) + 41,987,704 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.86%) + 415,967,627 stalled-cycles-backend:u # 5.13% backend cycles idle (74.95%) + 18,682,517,136 instructions:u # 2.31 insn per cycle + # 0.02 stalled cycles per insn (75.12%) + 2.515458731 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039283704129E-002 -Relative difference = 5.583829420356249e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.492780 sec - 7,427,313,914 cycles # 2.974 GHz - 14,251,086,474 instructions # 1.92 insn per cycle - 2.498394316 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.856113e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.000169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.000169e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.306514 sec + 7,472,562,788 cycles:u # 3.202 GHz (75.03%) + 43,082,857 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.98%) + 1,358,463,269 stalled-cycles-backend:u # 18.18% backend cycles idle (74.98%) + 14,266,218,058 instructions:u # 1.91 insn per cycle + # 0.10 stalled cycles per insn (74.98%) + 2.337985808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.444620 sec - 7,299,238,549 cycles # 2.980 GHz - 13,947,633,533 instructions # 1.91 insn per cycle - 2.450212772 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053337216261E-002 +Relative difference = 2.601499261602198e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.681955 sec - 6,492,318,128 cycles # 2.417 GHz - 13,422,094,611 instructions # 2.07 insn per cycle - 2.687432186 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052562326775E-002 -Relative difference = 1.997440588685788e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 0d6d3b3db1..685a4c5586 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:03:32 +DATE: 2024-03-03_14:33:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204111e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.576922 sec - 2,404,705,116 cycles # 2.985 GHz - 3,758,296,111 instructions # 1.56 insn per cycle - 0.864210592 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=1, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.883051e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.242645e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.010106e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 +TOTAL : 0.386398 sec + 972,456,357 cycles:u # 2.401 GHz (75.26%) + 2,356,298 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.40%) + 4,335,483 stalled-cycles-backend:u # 0.45% backend cycles idle (73.42%) + 1,793,751,584 instructions:u # 1.84 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 0.437765238 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.097656 sec - 9,472,450,742 cycles # 3.053 GHz - 25,268,175,697 instructions # 2.67 insn per cycle - 3.103042436 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.686654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.635896e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.635896e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.015670 sec + 9,959,556,040 cycles:u # 3.273 GHz (74.96%) + 38,732,011 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.03%) + 31,573,728 stalled-cycles-backend:u # 0.32% backend cycles idle (75.03%) + 28,562,544,127 instructions:u # 2.87 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 3.046442990 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) 
(512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039838495897E-002 -Relative difference = 1.2589928273811243e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.415041 sec - 7,164,638,851 cycles # 2.961 GHz - 16,869,197,703 instructions # 2.35 insn per cycle - 2.420723497 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.841583e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.313065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.313065e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.316521 sec + 7,503,861,433 cycles:u # 3.202 GHz (74.81%) + 38,236,403 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.87%) + 30,212,837 stalled-cycles-backend:u # 0.40% backend cycles idle (75.04%) + 16,951,104,854 instructions:u # 2.26 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 2.347740758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.394138 sec - 7,165,321,711 cycles # 2.987 GHz - 13,616,190,038 instructions # 1.90 insn per cycle - 2.399577311 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 4.065196e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.526485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.526485e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.223458 sec + 7,197,899,014 cycles:u # 3.198 GHz (74.79%) + 42,930,070 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.92%) + 349,598,703 stalled-cycles-backend:u # 4.86% backend cycles idle (75.09%) + 13,636,238,011 instructions:u # 1.89 insn per cycle + # 0.03 stalled cycles per insn (75.13%) + 2.254983022 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.363661 sec - 7,031,964,685 cycles # 2.970 GHz - 13,425,613,371 instructions # 1.91 insn per cycle - 2.369281481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053331759293E-002 +Relative difference = 2.597245327285885e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.592425 sec - 6,321,858,831 cycles # 2.434 GHz - 13,153,560,775 instructions # 2.08 insn per cycle - 2.597985755 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052536860923E-002 -Relative difference = 1.977588895209662e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 4be3e76490..a5f18d3b23 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:26:01 +DATE: 2024-03-03_14:03:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.717219 sec - 2,841,227,385 cycles # 2.957 GHz - 4,430,504,412 instructions # 1.56 insn per cycle - 1.049815549 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.318987e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.111965e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.340348e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.504885 sec + 1,317,748,469 cycles:u # 2.492 GHz (74.25%) + 2,348,029 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.33%) + 4,933,092 stalled-cycles-backend:u # 0.37% backend cycles idle (75.11%) + 2,054,494,016 instructions:u # 1.56 insn per cycle + # 0.00 stalled cycles per insn (75.78%) + 0.560665634 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590281E-002 +Relative difference = 7.67145406542181e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.069129 sec - 18,728,354,553 cycles # 3.083 GHz - 44,224,513,518 instructions # 2.36 insn per cycle - 6.079869673 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.233768e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.407841e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.407841e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.830420 sec + 19,739,395,369 cycles:u # 3.367 GHz (75.02%) + 51,696,496 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.03%) + 75,656,656 stalled-cycles-backend:u # 0.38% backend cycles idle (75.03%) + 46,919,398,962 instructions:u # 2.38 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 5.864610691 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001256 sec - 12,323,242,096 cycles # 3.075 GHz - 30,917,838,115 instructions # 2.51 insn per cycle - 4.017904894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.005878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.551825e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551825e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.863822 sec + 12,851,798,844 cycles:u # 3.300 GHz (74.97%) + 51,932,860 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.94%) + 1,589,021,560 stalled-cycles-backend:u # 12.36% backend cycles idle (74.96%) + 30,953,526,366 instructions:u # 2.41 insn per cycle + # 0.05 stalled cycles per insn (74.96%) + 3.898615324 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.416443 sec - 10,120,877,504 cycles # 2.958 GHz - 19,374,733,180 instructions # 1.91 insn per cycle - 3.431641491 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.599539e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.429045e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.429045e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.150319 sec + 10,380,428,648 cycles:u # 3.263 GHz (74.89%) + 49,648,911 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) + 872,379,532 stalled-cycles-backend:u # 8.40% backend cycles idle (75.11%) + 19,304,358,873 instructions:u # 1.86 insn per cycle + # 0.05 stalled cycles per insn (75.11%) + 3.185393340 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.374976 sec - 9,706,052,635 cycles # 2.871 GHz - 18,944,519,271 instructions # 1.95 insn per cycle - 3.395274500 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.760847 sec - 8,409,257,244 cycles # 2.233 GHz - 15,057,436,319 instructions # 1.79 insn per cycle - 3.776930410 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 77001f8935..15421ace75 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:26:35 +DATE: 2024-03-03_14:03:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699538 sec - 2,805,342,043 cycles # 2.999 GHz - 4,414,010,673 instructions # 1.57 insn per cycle - 1.020206687 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.936397e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.601934e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.922350e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 +TOTAL : 0.490032 sec + 1,245,563,795 cycles:u # 2.426 GHz (75.17%) + 2,192,930 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.24%) + 5,709,947 stalled-cycles-backend:u # 0.46% backend cycles idle (75.23%) + 2,036,787,970 instructions:u # 1.64 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 0.545663761 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590284E-002 +Relative difference = 7.67145379496374e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.837265 sec - 18,090,198,997 cycles # 3.097 GHz - 42,472,863,850 instructions # 2.35 insn per cycle - 5.848007644 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.313034e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511779e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511779e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.519982 sec + 18,669,891,707 cycles:u # 3.364 GHz (74.93%) + 51,510,445 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.92%) + 64,158,399 stalled-cycles-backend:u # 0.34% backend cycles idle (74.96%) + 44,665,149,420 instructions:u # 2.39 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.553260637 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.920672 sec - 12,137,736,337 cycles # 3.092 GHz - 30,225,042,392 instructions # 2.49 insn per cycle - 3.938311189 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.037716e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.603868e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.603868e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.815384 sec + 12,695,824,770 cycles:u # 3.301 GHz (74.88%) + 51,368,917 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.95%) + 426,406,512 stalled-cycles-backend:u # 3.36% backend cycles idle (75.04%) + 30,185,380,945 instructions:u # 2.38 insn per cycle + # 0.01 stalled cycles per insn (75.04%) + 3.850199439 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.437770 sec - 10,015,371,277 cycles # 2.909 GHz - 19,256,811,213 instructions # 1.92 insn per cycle - 3.454377757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.623894e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.473036e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.473036e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.130926 sec + 10,287,378,182 cycles:u # 3.253 GHz (75.00%) + 46,386,917 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.96%) + 302,753,328 stalled-cycles-backend:u # 2.94% backend cycles idle (74.85%) + 18,963,256,356 instructions:u # 1.84 insn per cycle + # 0.02 stalled cycles per insn (74.85%) + 3.166792585 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.235635 sec - 9,645,810,411 cycles # 2.976 GHz - 18,756,051,671 instructions # 1.94 insn per cycle - 3.251774736 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.592139 sec - 8,293,535,644 cycles # 2.305 GHz - 14,979,176,568 instructions # 1.81 insn per cycle - 3.613399615 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 9a5df19d5b..241e4837ec 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:27:08 +DATE: 2024-03-03_14:04:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.535145 sec - 2,303,454,226 cycles # 2.990 GHz - 3,249,200,622 instructions # 1.41 insn per cycle - 0.848848936 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.792674e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.953494e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.007777e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.394758 sec + 923,327,966 cycles:u # 2.275 GHz (74.36%) + 2,293,308 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.05%) + 5,604,796 stalled-cycles-backend:u # 0.61% backend cycles idle (75.03%) + 1,453,147,578 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (74.42%) + 0.449959088 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.903669 sec - 15,175,795,116 cycles # 3.093 GHz - 38,374,949,840 instructions # 2.53 insn per cycle - 4.917105673 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.523861e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.590498e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.590498e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.334949 sec + 14,979,793,943 cycles:u # 3.431 GHz (74.91%) + 9,456,313 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) + 1,486,955,679 stalled-cycles-backend:u # 9.93% backend cycles idle (74.97%) + 38,693,956,907 instructions:u # 2.58 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 4.369306063 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.968890 sec - 9,101,848,873 cycles # 3.060 GHz - 24,578,505,710 instructions # 2.70 insn per cycle - 2.986159008 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.498105e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.721418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.721418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.510166 sec + 8,578,472,900 cycles:u # 3.375 GHz (74.82%) + 9,889,349 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.93%) + 492,113,192 stalled-cycles-backend:u # 5.74% backend cycles idle (75.09%) + 24,326,612,305 instructions:u # 2.84 insn per cycle + # 0.02 stalled cycles per insn (75.14%) + 2.545870437 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.936093 sec - 5,474,671,571 cycles # 2.819 GHz - 11,252,385,098 instructions # 2.06 insn per cycle - 1.954008279 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.681670e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.274519e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.274519e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.544546 sec + 5,171,840,561 cycles:u # 3.281 GHz (74.92%) + 8,432,588 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.13%) + 975,939,486 stalled-cycles-backend:u # 18.87% backend cycles idle (75.14%) + 11,476,671,248 instructions:u # 2.22 insn per cycle + # 0.09 stalled cycles per insn (75.14%) + 1.581412468 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.774092 sec - 4,972,729,611 cycles # 2.794 GHz - 10,557,445,760 instructions # 2.12 insn per cycle - 1.789622209 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.799185 sec - 5,395,066,029 cycles # 1.924 GHz - 7,793,871,634 instructions # 1.44 insn per cycle - 2.817161041 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 598396a8e7..7dbee0f112 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,223 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:14:07 +DATE: 2024-03-03_14:52:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.801549 sec - 3,157,604,220 cycles # 3.025 GHz - 4,827,294,021 instructions # 1.53 insn per cycle - 1.101037847 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.962044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.790802e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.790802e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.237384 sec + 3,741,729,435 cycles:u # 2.943 GHz (75.04%) + 21,439,104 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.13%) + 1,158,531,216 stalled-cycles-backend:u # 30.96% backend cycles idle (74.85%) + 3,954,224,434 instructions:u # 1.06 insn per cycle + # 0.29 stalled cycles per insn (74.89%) + 1.301539646 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.008942 sec - 15,497,351,856 cycles # 3.090 GHz - 38,433,512,801 instructions # 2.48 insn per cycle - 5.015755142 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.521503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.586603e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.586603e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.416052 sec + 15,061,778,809 cycles:u # 3.380 GHz (74.87%) + 10,460,452 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%) + 1,458,789,729 stalled-cycles-backend:u # 9.69% backend cycles idle (75.05%) + 38,688,044,179 instructions:u # 2.57 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 4.459681433 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.090616 sec - 9,430,020,802 cycles # 3.049 GHz - 24,763,068,407 instructions # 2.63 insn per cycle - 3.097621879 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.465557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.686838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.686838e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.613030 sec + 8,709,883,450 cycles:u # 3.282 GHz (75.01%) + 10,315,750 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) + 509,887,377 stalled-cycles-backend:u # 5.85% backend cycles idle (74.99%) + 24,603,742,616 instructions:u # 2.82 insn per cycle + # 0.02 stalled cycles per insn (74.98%) + 2.658578687 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.984017 sec - 5,826,620,771 cycles # 2.928 GHz - 11,538,062,844 instructions # 1.98 insn per cycle - 1.990946794 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 7.590705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.168333e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.168333e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.657586 sec + 5,373,669,337 cycles:u # 3.157 GHz (74.71%) + 9,036,023 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.85%) + 958,756,649 stalled-cycles-backend:u # 17.84% backend cycles idle (75.09%) + 11,775,474,143 instructions:u # 2.19 insn per cycle + # 0.08 stalled cycles per insn (75.13%) + 1.706319579 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.799262 sec - 5,294,562,816 cycles # 2.933 GHz - 10,843,404,980 instructions # 2.05 insn per cycle - 1.806082483 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.778138 sec - 5,743,518,580 cycles # 2.063 GHz - 8,037,207,687 instructions # 1.40 insn per cycle - 2.785184310 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 977053e874..265c701b52 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,210 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:27:17 +DATE: 2024-03-03_15:04:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.699007e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971161e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.025778e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.617245 sec - 2,532,813,012 cycles # 2.999 GHz - 3,701,870,616 instructions # 1.46 insn per cycle - 0.904006340 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.067748 sec + 3,232,507,792 cycles:u # 2.948 GHz (74.39%) + 10,800,000 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.47%) + 1,140,820,368 stalled-cycles-backend:u # 35.29% backend cycles idle (74.75%) + 2,947,382,727 instructions:u # 0.91 insn per cycle + # 0.39 stalled cycles per insn (75.20%) + 1.121100120 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.525527e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.590693e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.590693e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.966854 sec - 15,343,121,883 cycles # 3.087 GHz - 38,390,661,623 instructions # 2.50 insn per cycle - 4.972403311 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.331199 sec + 14,981,765,080 cycles:u # 3.434 GHz (74.88%) + 9,200,653 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) + 1,520,129,866 stalled-cycles-backend:u # 10.15% backend cycles idle (75.00%) + 38,690,269,362 instructions:u # 2.58 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 4.364994952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.432769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.649733e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.649733e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.079495 sec - 9,279,730,828 cycles # 3.010 GHz - 24,577,932,954 instructions # 2.65 insn per cycle - 3.085060857 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.544461 sec + 8,704,350,159 cycles:u # 3.379 GHz (74.87%) + 10,338,759 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.85%) + 534,931,295 stalled-cycles-backend:u # 6.15% backend cycles idle (74.95%) + 24,372,274,267 instructions:u # 2.80 insn per cycle + # 0.02 stalled cycles per insn (75.11%) + 2.578255119 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.677967e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.270492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.270492e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.937503 sec - 5,654,473,993 cycles # 2.911 GHz - 11,233,989,199 instructions # 1.99 insn per cycle - 1.943141738 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 1.545617 sec + 5,214,954,602 cycles:u # 3.307 GHz (74.64%) + 9,315,822 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.81%) + 972,093,360 stalled-cycles-backend:u # 18.64% backend cycles idle (75.06%) + 11,473,357,235 instructions:u # 2.20 insn per cycle + # 0.08 stalled cycles per insn (75.15%) + 1.579355171 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.757396 sec - 5,128,637,723 cycles # 2.910 GHz - 10,505,547,256 instructions # 2.05 insn per cycle - 1.762900213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.739915 sec - 5,558,468,681 cycles # 2.025 GHz - 7,741,606,815 instructions # 1.39 insn per cycle - 2.745378653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5cfc13b3e..e5dc219326 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,212 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:20:45 +DATE: 2024-03-03_15:00:28 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.697938 sec - 2,798,675,219 cycles # 3.021 GHz - 4,376,672,842 instructions # 1.56 insn per cycle - 0.983897382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.817032e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967328e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022046e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.183136 sec + 3,634,374,116 cycles:u # 2.990 GHz (75.05%) + 21,461,083 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.05%) + 1,143,444,081 stalled-cycles-backend:u # 31.46% backend cycles idle (75.00%) + 3,851,624,402 instructions:u # 1.06 insn per cycle + # 0.30 stalled cycles per insn (74.95%) + 1.233526426 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.893907 sec - 15,162,024,600 cycles # 3.096 GHz - 38,372,989,497 instructions # 2.53 insn per cycle - 4.899450957 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.525556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.591135e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.591135e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.330569 sec + 14,984,543,242 cycles:u # 3.435 GHz (74.88%) + 9,289,687 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) + 1,446,915,645 stalled-cycles-backend:u # 9.66% backend cycles idle (75.00%) + 38,680,440,371 instructions:u # 2.58 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 4.364535873 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.935182 sec - 9,091,941,153 cycles # 3.094 GHz - 24,577,519,112 instructions # 2.70 insn per cycle - 2.940777194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.489599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.715001e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.715001e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.514792 sec + 8,607,437,165 cycles:u # 3.381 GHz (74.72%) + 9,913,991 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.87%) + 505,052,878 stalled-cycles-backend:u # 5.87% backend cycles idle (75.02%) + 24,369,233,220 instructions:u # 2.83 insn per cycle + # 0.02 stalled cycles per insn (75.17%) + 2.548468659 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.869802 sec - 5,458,289,042 cycles # 2.911 GHz - 11,250,961,339 instructions # 2.06 insn per cycle - 1.875881825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.678183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.266961e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.266961e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.545730 sec + 5,218,044,328 cycles:u # 3.309 GHz (74.65%) + 9,304,576 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.78%) + 959,781,315 stalled-cycles-backend:u # 18.39% backend cycles idle (75.03%) + 11,480,727,731 instructions:u # 2.20 insn per cycle + # 0.08 stalled cycles per insn (75.15%) + 1.579407077 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.719311 sec - 5,034,836,824 cycles # 2.920 GHz - 10,558,271,294 instructions # 2.10 insn per cycle - 1.725057980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.716839 sec - 5,403,556,568 cycles # 1.987 GHz - 7,794,191,095 instructions # 1.44 insn per cycle - 2.722528243 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 73356b00dd..a9d6608bd7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:27:35 +DATE: 2024-03-03_14:04:47 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.538743 sec - 2,297,794,086 cycles # 2.963 GHz - 3,276,125,304 instructions # 1.43 insn per cycle - 0.856267333 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.841177e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.927044e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.980188e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.377611 sec + 871,922,050 cycles:u # 2.173 GHz (74.03%) + 2,265,714 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.12%) + 5,115,843 stalled-cycles-backend:u # 0.59% backend cycles idle (74.98%) + 1,365,035,303 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (75.93%) + 0.429796399 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.877526 sec - 15,081,677,651 cycles # 3.089 GHz - 40,100,660,385 instructions # 2.66 insn per cycle - 4.889980594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.447310e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.508276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.508276e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.463871 sec + 15,425,766,743 cycles:u # 3.431 GHz (74.93%) + 9,099,706 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) + 26,594,098 stalled-cycles-backend:u # 0.17% backend cycles idle (74.94%) + 39,551,830,467 instructions:u # 2.56 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 4.499242437 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.787478 sec - 8,606,981,244 cycles # 3.082 GHz - 23,670,854,000 instructions # 2.75 insn per cycle - 2.801213189 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.398083e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.614446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.614446e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.562355 sec + 8,749,116,043 cycles:u # 3.373 GHz (74.99%) + 10,470,436 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.02%) + 1,497,401,215 stalled-cycles-backend:u # 17.11% backend cycles idle (75.02%) + 23,520,129,083 instructions:u # 2.69 insn per cycle + # 0.06 stalled cycles per insn (75.02%) + 2.598149109 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.088271 sec - 6,101,163,180 cycles # 2.915 GHz - 13,060,965,379 instructions # 2.14 insn per cycle - 2.110411764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.931928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.410277e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.410277e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.693161 sec + 5,687,334,776 cycles:u # 3.297 GHz (74.96%) + 9,853,745 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.96%) + 752,383,950 stalled-cycles-backend:u # 13.23% backend cycles idle (75.00%) + 13,198,663,716 instructions:u # 2.32 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 1.729123364 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007458 sec - 5,795,313,103 cycles # 2.878 GHz - 12,320,114,352 instructions # 2.13 insn per cycle - 2.035740422 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.054998 sec - 5,836,990,709 cycles # 1.908 GHz - 9,601,704,067 instructions # 1.64 insn per cycle - 3.069883688 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 7ca7ca6f27..bf0bdd420a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:03:58 +DATE: 2024-03-03_14:34:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.520509 sec - 2,251,864,611 cycles # 2.979 GHz - 3,200,076,053 instructions # 1.42 insn per cycle - 0.813049887 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.824404e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.962298e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.016640e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.378356 sec + 915,619,025 cycles:u # 2.271 GHz (75.19%) + 2,407,774 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.60%) + 5,389,487 stalled-cycles-backend:u # 0.59% backend cycles idle (73.96%) + 1,476,869,829 instructions:u # 1.61 insn per cycle + # 0.00 stalled cycles per insn (74.26%) + 0.433727402 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.235724 sec - 13,018,811,907 cycles # 3.070 GHz - 34,384,492,801 instructions # 2.64 insn per cycle - 4.241723051 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.883473e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.970155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.970155e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.817212 sec + 13,160,873,635 cycles:u # 3.419 GHz (74.86%) + 9,243,267 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.97%) + 387,509,790 stalled-cycles-backend:u # 2.94% backend cycles idle (75.06%) + 35,783,766,947 instructions:u # 2.72 insn per cycle + # 0.01 stalled cycles per insn (75.06%) + 3.851933365 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.527791 sec - 10,618,068,276 cycles # 3.005 GHz - 24,006,297,751 instructions # 2.26 insn per cycle - 3.533644608 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.431790e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.652319e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.652319e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.545488 sec + 8,688,076,238 cycles:u # 3.371 GHz (74.90%) + 9,841,654 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) + 2,450,051,747 stalled-cycles-backend:u # 28.20% backend cycles idle (74.86%) + 21,908,081,719 instructions:u # 2.52 insn per cycle + # 0.11 stalled cycles per insn (74.97%) + 2.581714908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.268558 sec - 6,594,099,256 cycles # 2.900 GHz - 12,400,446,525 instructions # 1.88 insn per cycle - 2.274329127 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.142175 sec - 6,250,159,272 cycles # 2.911 GHz - 11,574,474,977 instructions # 1.85 insn per cycle - 2.148019416 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.817650e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.279825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.279825e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.718734 sec + 5,777,821,529 cycles:u # 3.300 GHz (74.90%) + 8,837,696 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.91%) + 1,463,068,323 stalled-cycles-backend:u # 25.32% backend cycles idle (74.89%) + 12,086,003,846 instructions:u # 2.09 insn per cycle + # 0.12 stalled cycles per insn (74.89%) + 1.755239088 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.637824 sec - 5,343,225,675 cycles # 2.022 GHz - 9,294,792,947 instructions # 1.74 insn per cycle - 2.643638198 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +------------------------------------------------------------------------- +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 6740b658ab..c1d50feda3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 
HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:04:25 +DATE: 2024-03-03_14:34:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525125 sec - 2,266,508,632 cycles # 2.999 GHz - 3,227,683,893 instructions # 1.42 insn per cycle - 0.815560561 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.845504e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.929444e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.982727e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.376878 sec + 859,889,632 cycles:u # 2.142 GHz (74.88%) + 2,101,172 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.07%) + 4,913,440 stalled-cycles-backend:u # 0.57% backend cycles idle (75.14%) + 1,408,843,301 instructions:u # 1.64 insn per cycle + # 0.00 stalled cycles per insn (74.34%) + 0.429973828 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.008193 sec - 12,350,315,150 cycles # 3.077 GHz - 35,037,181,267 instructions # 2.84 insn per cycle - 4.014100641 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.237547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.346977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346977e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.418474 sec + 11,732,063,016 cycles:u # 3.401 GHz (74.96%) + 9,422,894 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) + 20,017,604 stalled-cycles-backend:u # 0.17% backend cycles idle (74.97%) + 35,763,295,048 instructions:u # 3.05 insn per cycle + # 0.00 stalled cycles per insn (74.96%) + 3.453721586 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.458899 sec - 10,688,048,117 cycles # 3.085 GHz - 23,082,662,787 instructions # 2.16 insn per cycle - 3.464737128 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.827548e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089939e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.089939e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.352011 sec + 8,010,475,735 cycles:u # 3.360 GHz (74.85%) + 9,398,588 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.02%) + 1,674,206,966 stalled-cycles-backend:u # 20.90% backend cycles idle (75.18%) + 21,206,260,334 instructions:u # 2.65 insn per cycle + # 0.08 stalled cycles per insn (75.18%) + 2.388527408 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.175532 sec - 6,167,789,524 cycles # 2.829 GHz - 11,956,365,830 instructions # 1.94 insn per cycle - 2.181490352 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.596714e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.178513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.178513e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.560107 sec + 5,213,460,094 cycles:u # 3.273 GHz (74.92%) + 9,518,486 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) + 731,122,291 stalled-cycles-backend:u # 14.02% backend cycles idle (74.90%) + 11,424,220,646 instructions:u # 2.19 insn per cycle + # 0.06 stalled cycles per insn (74.92%) + 1.601201132 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.062589 sec - 6,012,687,929 cycles # 2.908 GHz - 11,129,506,913 instructions # 1.85 insn per cycle - 2.068524285 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.581777 sec - 5,215,223,845 cycles # 2.016 GHz - 9,019,923,506 instructions # 1.73 insn per cycle - 2.587755549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 3164378b7a..ec9c2640e1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:04 +DATE: 2024-03-03_14:05:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.485254 sec - 2,068,141,298 cycles # 2.904 GHz - 2,916,142,359 instructions # 1.41 insn per cycle - 0.784434250 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.576605e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.946758e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.111159e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.324622 sec + 772,289,751 cycles:u # 2.244 GHz (73.84%) + 2,245,044 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.82%) + 4,522,934 stalled-cycles-backend:u # 0.59% backend cycles idle (75.00%) + 1,308,126,073 instructions:u # 1.69 insn per cycle + # 0.00 stalled cycles per insn (75.21%) + 0.373595997 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.621612 sec - 14,026,409,554 cycles # 3.032 GHz - 38,341,238,705 instructions # 2.73 insn per cycle - 4.632085783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.996944e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.087796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.087796e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.642357 sec + 12,627,649,362 cycles:u # 3.440 GHz (74.94%) + 7,521,215 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.85%) + 954,432,603 stalled-cycles-backend:u # 7.56% backend cycles idle (74.85%) + 37,135,203,641 instructions:u # 2.94 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 3.672981701 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.094155 sec - 6,477,656,873 cycles # 3.085 GHz - 15,815,714,256 instructions # 2.44 insn per cycle - 2.109661469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.308088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.778711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.778711e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.805701 sec + 6,218,278,603 cycles:u # 3.393 GHz (74.62%) + 4,713,384 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.93%) + 1,929,137,521 stalled-cycles-backend:u # 31.02% backend cycles idle (75.13%) + 15,189,730,910 instructions:u # 2.44 insn per cycle + # 0.13 stalled cycles per insn (75.13%) + 1.837265834 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.180439 sec - 3,464,791,228 cycles # 2.924 GHz - 7,594,553,534 instructions # 2.19 insn per cycle - 1.196926932 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.222825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379981e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379981e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.000951 sec + 3,394,012,608 cycles:u # 3.300 GHz (74.63%) + 8,001,044 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.88%) + 1,047,207,876 stalled-cycles-backend:u # 30.85% backend cycles idle (75.11%) + 7,668,224,582 instructions:u # 2.26 insn per cycle + # 0.14 stalled cycles per insn (75.11%) + 1.032065308 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.103361 sec - 3,253,544,502 cycles # 2.935 GHz - 7,202,500,133 instructions # 2.21 insn per cycle - 1.115792553 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.467307 sec - 3,062,229,633 cycles # 2.079 GHz - 5,834,823,887 instructions # 1.91 insn per cycle - 1.480044473 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index b32abcb3fe..2b9a9d6e5c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,223 +1,175 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:14:35 +DATE: 2024-03-03_14:53:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.665285 sec - 2,679,931,908 cycles # 3.001 GHz - 4,173,181,221 instructions # 1.56 insn per cycle - 0.950193790 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.457170e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.070824e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070824e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.158557 sec + 3,553,635,129 cycles:u # 2.996 GHz (74.66%) + 21,028,695 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.10%) + 1,107,067,719 stalled-cycles-backend:u # 31.15% backend cycles idle (75.11%) + 3,881,698,330 instructions:u # 1.09 insn per cycle + # 0.29 stalled cycles per insn (75.01%) + 1.213936904 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.608146 sec - 14,198,803,048 cycles # 3.078 GHz - 38,383,841,480 instructions # 2.70 insn per cycle - 4.614561058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.995627e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.086783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086783e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.683431 sec + 12,659,259,788 cycles:u # 3.407 GHz (75.03%) + 7,459,048 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.03%) + 1,249,468,881 stalled-cycles-backend:u # 9.87% backend cycles idle (75.03%) + 37,093,663,641 instructions:u # 2.93 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 3.717841479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.164951 sec - 6,682,648,138 cycles # 3.079 GHz - 16,095,511,662 instructions # 2.41 insn per cycle - 2.171478460 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.311512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.745050e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.745050e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.850834 sec + 6,221,194,518 cycles:u # 3.304 GHz (74.94%) + 7,340,758 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) + 1,907,140,469 stalled-cycles-backend:u # 30.66% backend cycles idle (74.98%) + 15,554,035,131 instructions:u # 2.50 insn per cycle + # 0.12 stalled cycles per insn (74.95%) + 1.887022954 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.245724 sec - 3,655,872,382 cycles # 2.921 GHz - 7,830,960,228 instructions # 2.14 insn per cycle - 1.252058919 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.209800e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.363451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363451e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.055288 sec + 3,424,597,828 cycles:u # 3.148 GHz (75.00%) + 7,657,748 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.00%) + 1,064,027,342 stalled-cycles-backend:u # 31.07% backend cycles idle (75.06%) + 7,991,108,228 instructions:u # 2.33 insn per cycle + # 0.13 stalled cycles per insn (75.05%) + 1.091443231 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.193275 sec - 3,439,455,837 cycles # 2.869 GHz - 7,440,735,686 instructions # 2.16 insn per cycle - 1.199824293 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.539244 sec - 3,276,504,779 cycles # 2.121 GHz - 6,089,433,455 instructions # 1.86 insn per cycle - 1.545785864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 1418229a2f..a3128f7500 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,210 +1,165 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:27:44 +DATE: 2024-03-03_15:04:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962164e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.558880 sec - 2,364,095,478 cycles # 3.003 GHz - 3,484,344,192 instructions # 1.47 insn per cycle - 0.845198156 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.851869e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.945347e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.107629e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.014251 sec + 3,108,031,718 cycles:u # 2.996 GHz (74.99%) + 10,712,388 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.43%) + 1,156,197,374 stalled-cycles-backend:u # 37.20% backend cycles idle (75.34%) + 2,767,536,694 instructions:u # 0.89 insn per cycle + # 0.42 stalled cycles per insn (74.96%) + 1.060791796 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.995904e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.086251e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.086251e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.585598 sec - 14,172,267,813 cycles # 3.088 GHz - 38,370,669,897 instructions # 2.71 insn per cycle - 4.590984697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.644384 sec + 12,634,503,228 cycles:u # 3.441 GHz (74.95%) + 7,148,826 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) + 1,182,578,195 stalled-cycles-backend:u # 9.36% backend cycles idle (74.85%) + 37,144,398,934 instructions:u # 2.94 insn per cycle + # 0.03 stalled cycles per insn (74.86%) + 3.673697809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.148796 sec - 6,634,619,629 cycles # 3.081 GHz - 15,827,825,218 instructions # 2.39 insn per cycle - 2.154083020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.142222e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.546841e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.546841e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.850729 sec + 6,338,703,028 cycles:u # 3.376 GHz (74.86%) + 7,177,415 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.68%) + 2,026,441,941 stalled-cycles-backend:u # 31.97% backend cycles idle (74.73%) + 15,242,487,970 instructions:u # 2.40 insn per cycle + # 0.13 stalled cycles per insn (75.13%) + 1.879541975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.236002 sec - 3,624,228,310 cycles # 2.921 GHz - 7,577,923,207 instructions # 2.09 insn per cycle - 1.241371528 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.223659e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380802e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380802e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.999997 sec + 3,369,007,067 cycles:u # 3.281 GHz (75.08%) + 7,192,520 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.08%) + 1,049,602,344 stalled-cycles-backend:u # 31.15% backend cycles idle (75.08%) + 7,677,145,813 instructions:u # 2.28 insn per cycle + # 0.14 stalled cycles per insn (74.70%) + 1.028886621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.166800 sec - 3,412,475,771 cycles # 2.913 GHz - 7,154,107,852 instructions # 2.10 insn per cycle - 1.172143118 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.519807 sec - 3,228,336,001 cycles # 2.118 GHz - 5,784,936,071 instructions # 1.79 insn per cycle - 1.525231071 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index d1c301e36a..149b294a79 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,212 +1,169 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:21:13 +DATE: 2024-03-03_15:00:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.625420 sec - 2,414,961,393 cycles # 2.854 GHz - 3,791,061,685 instructions # 1.57 insn per cycle - 0.904442863 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.243357e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.926144e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.087434e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.161597 sec + 3,601,612,751 cycles:u # 3.026 GHz (75.13%) + 21,057,800 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.19%) + 1,172,309,632 stalled-cycles-backend:u # 32.55% backend cycles idle (75.19%) + 3,776,445,937 instructions:u # 1.05 insn per cycle + # 0.31 stalled cycles per insn (75.22%) + 1.209122949 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586154 sec - 14,183,213,679 cycles # 3.090 GHz - 38,341,040,102 instructions # 2.70 insn per cycle - 4.591510537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.998981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.089526e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.089526e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.639809 sec + 12,626,040,265 cycles:u # 3.443 GHz (74.92%) + 7,470,036 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) + 803,774,158 stalled-cycles-backend:u # 6.37% backend cycles idle (74.93%) + 37,156,129,537 instructions:u # 2.94 insn per cycle + # 0.02 stalled cycles per insn (74.94%) + 3.669673012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.084805 sec - 6,467,654,599 cycles # 3.095 GHz - 15,814,952,627 instructions # 2.45 insn per cycle - 2.090234852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.135844e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.545353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.545353e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.852475 sec + 6,354,804,966 cycles:u # 3.381 GHz (74.89%) + 7,176,488 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) + 2,019,535,309 stalled-cycles-backend:u # 31.78% backend cycles idle (74.90%) + 15,230,071,652 instructions:u # 2.40 insn per cycle + # 0.13 stalled cycles per insn (74.92%) + 1.881704419 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.181028 sec - 3,453,301,700 cycles # 2.913 GHz - 7,593,575,205 instructions # 2.20 insn per cycle - 1.186225517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.222623e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379871e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.001076 sec + 3,380,607,908 cycles:u # 3.289 GHz (74.76%) + 7,784,429 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.10%) + 1,049,808,694 stalled-cycles-backend:u # 31.05% backend cycles idle (75.10%) + 7,654,634,643 instructions:u # 2.26 insn per cycle + # 0.14 stalled cycles per insn (75.11%) + 1.029875459 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.108864 sec - 3,247,038,827 cycles # 2.916 GHz - 7,202,168,264 instructions # 2.22 insn per cycle - 1.114391762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.464294 sec - 3,059,603,183 cycles # 2.083 GHz - 5,833,854,527 instructions # 1.91 insn per cycle - 1.469681735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index adc2ed2114..913244345c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:27 +DATE: 2024-03-03_14:05:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480923 sec - 2,116,431,851 cycles # 3.003 GHz - 3,022,655,895 instructions # 1.43 insn per cycle - 0.777218279 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.481980e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.117565e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.300799e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.323751 sec + 784,351,516 cycles:u # 2.273 GHz (74.32%) + 2,192,770 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.45%) + 4,264,005 stalled-cycles-backend:u # 0.54% backend cycles idle (75.47%) + 1,367,200,629 instructions:u # 1.74 insn per cycle + # 0.00 stalled cycles per insn (74.27%) + 0.376841355 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.644587 sec - 14,360,257,758 cycles # 3.089 GHz - 39,833,716,550 instructions # 2.77 insn per cycle - 4.652300252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.982273e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.072748e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.072748e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.660343 sec + 12,707,203,346 cycles:u # 3.446 GHz (74.89%) + 7,110,827 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) + 11,876,317 stalled-cycles-backend:u # 0.09% backend cycles idle (75.05%) + 37,479,190,658 instructions:u # 2.95 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 3.690309537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199028000236 -Relative difference = 4.790961076489297e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.888755 sec - 5,601,188,109 cycles # 2.957 GHz - 15,285,931,975 instructions # 2.73 insn per cycle - 1.901754882 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.302120e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.884122e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.884122e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.579376 sec + 5,415,260,274 cycles:u # 3.370 GHz (74.78%) + 7,723,910 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.03%) + 1,444,796,820 stalled-cycles-backend:u # 26.68% backend cycles idle (75.11%) + 15,194,831,822 instructions:u # 2.81 insn per cycle + # 0.10 stalled cycles per insn (75.12%) + 1.610679733 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.623137 sec - 4,755,173,593 cycles # 2.919 GHz - 9,735,141,159 instructions # 2.05 insn per cycle - 1.639641207 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 8.906569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.710449e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.710449e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.320165 sec + 4,487,451,290 cycles:u # 3.332 GHz (75.15%) + 7,209,142 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.10%) + 1,668,263,971 stalled-cycles-backend:u # 37.18% backend cycles idle (75.06%) + 9,805,752,185 instructions:u # 2.19 insn per cycle + # 0.17 stalled cycles per insn (74.77%) + 1.351069736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.586631 sec - 4,632,931,570 cycles # 2.912 GHz - 9,326,747,974 instructions # 2.01 insn per cycle - 1.599475417 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186428369954 +Relative difference = 1.7604478492421832e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.762945 sec - 3,668,593,409 cycles # 2.074 GHz - 7,034,535,336 instructions # 1.92 insn per cycle - 1.779301540 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183459779248 -Relative difference = 1.7053177021099307e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 82aee2242c..b6180da33a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:04:53 +DATE: 2024-03-03_14:34:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478757 sec - 2,104,839,063 cycles # 2.996 GHz - 2,995,662,279 instructions # 1.42 insn per cycle - 0.760483148 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.326362e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.955338e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.118530e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.325621 sec + 779,266,234 cycles:u # 2.250 GHz (74.75%) + 2,296,626 stalled-cycles-frontend:u # 0.29% frontend cycles idle (73.21%) + 4,232,127 stalled-cycles-backend:u # 0.54% backend cycles idle (73.21%) + 1,288,380,090 instructions:u # 1.65 insn per cycle + # 0.00 stalled cycles per insn (75.34%) + 0.377748048 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.311067 sec - 12,598,770,011 cycles # 2.919 GHz - 34,372,549,657 instructions # 2.73 insn per cycle - 4.316594695 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.213545e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317700e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317700e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.406405 sec + 11,820,211,699 cycles:u # 3.442 GHz (74.86%) + 6,860,688 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) + 1,730,547,263 stalled-cycles-backend:u # 14.64% backend cycles idle (75.04%) + 34,233,034,389 instructions:u # 2.90 insn per cycle + # 0.05 stalled cycles per insn (75.08%) + 3.436615248 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.978899 sec - 6,105,197,866 cycles # 3.078 GHz - 14,859,942,037 instructions # 2.43 insn per cycle - 1.984598314 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.214009e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.786074e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.786074e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.597296 sec + 5,453,738,916 cycles:u # 3.356 GHz (74.89%) + 7,219,487 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.90%) + 2,066,216,281 stalled-cycles-backend:u # 37.89% backend cycles idle (74.69%) + 14,675,976,525 instructions:u # 2.69 insn per cycle + # 0.14 stalled cycles per insn (74.74%) + 1.628491271 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193803280592 -Relative difference = 1.8746278463897685e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198769558221 +Relative difference = 6.06481491495597e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.494763 sec - 4,316,279,907 cycles # 2.878 GHz - 9,028,948,283 instructions # 2.09 insn per cycle - 1.500523975 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 9.357766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.025050e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.025050e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.264737 sec + 4,300,837,383 cycles:u # 3.327 GHz (74.68%) + 7,671,605 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.68%) + 1,661,680,380 stalled-cycles-backend:u # 38.64% backend cycles idle (74.98%) + 9,050,907,731 instructions:u # 2.10 insn per cycle + # 0.18 stalled cycles per insn (75.25%) + 1.296250495 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.509333 sec - 4,207,142,397 cycles # 2.778 GHz - 8,663,183,236 instructions # 2.06 insn per cycle - 1.515104262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186752004549 +Relative difference = 1.6009291367898262e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.886655 sec - 3,832,564,290 cycles # 2.026 GHz - 7,807,000,610 instructions # 2.04 insn per cycle - 1.892395760 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183246739209 -Relative difference = 1.6003107281264138e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index dda1db1b3c..a98059d056 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:05:16 +DATE: 2024-03-03_14:35:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478497 sec - 2,092,584,267 cycles # 2.987 GHz - 2,982,481,806 instructions # 1.43 insn per cycle - 0.759974164 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.680623e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.114696e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.297874e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 +TOTAL : 0.322860 sec + 733,848,024 cycles:u # 2.130 GHz (75.93%) + 1,991,309 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.65%) + 4,436,940 stalled-cycles-backend:u # 0.60% backend cycles idle (74.74%) + 1,320,496,964 instructions:u # 1.80 insn per cycle + # 0.00 stalled cycles per insn (74.03%) + 0.374386065 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.962914 sec - 11,745,545,496 cycles # 2.960 GHz - 35,108,793,810 instructions # 2.99 insn per cycle - 3.968579892 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.490256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.613448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.613448e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.147954 sec + 10,917,586,256 cycles:u # 3.438 GHz (74.84%) + 6,884,856 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) + 125,728,239 stalled-cycles-backend:u # 1.15% backend cycles idle (75.06%) + 35,430,594,123 instructions:u # 3.25 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 3.178248752 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.925244 sec - 5,962,598,726 cycles # 3.089 GHz - 14,469,931,867 instructions # 2.43 insn per cycle - 1.931094914 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.802344e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483665e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.486921 sec + 5,089,368,535 cycles:u # 3.360 GHz (74.69%) + 7,103,657 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.84%) + 1,342,600,549 stalled-cycles-backend:u # 26.38% backend cycles idle (75.11%) + 14,067,345,434 instructions:u # 2.76 insn per cycle + # 0.10 stalled cycles per insn (75.18%) + 1.518672718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193583255634 -Relative difference = 1.7661780742548925e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198892958462 +Relative difference = 5.4565783974899003e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.475701 sec - 4,155,772,808 cycles # 2.809 GHz - 8,874,967,057 instructions # 2.14 insn per cycle - 1.481449825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.023520e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131303e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131303e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.168445 sec + 3,961,187,521 cycles:u # 3.311 GHz (74.64%) + 7,048,970 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.73%) + 1,467,403,847 stalled-cycles-backend:u # 37.04% backend cycles idle (75.06%) + 8,611,214,037 instructions:u # 2.17 insn per cycle + # 0.17 stalled cycles per insn (75.27%) + 1.200246018 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.405788 sec - 4,123,527,517 cycles # 2.923 GHz - 8,411,119,259 instructions # 2.04 insn per cycle - 1.411551419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186836987734 +Relative difference = 1.559041129563128e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.851731 sec - 3,787,634,254 cycles # 2.040 GHz - 7,699,934,932 instructions # 2.03 insn per cycle - 1.857323010 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183204829693 -Relative difference = 1.5796536184903122e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 9748a5aab4..7906350f50 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:51 +DATE: 2024-03-03_14:05:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526886 sec - 2,307,341,508 cycles # 3.024 GHz - 3,271,429,537 instructions # 1.42 insn per cycle - 0.836809323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.858100e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.024750e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080805e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.376752 sec + 893,024,523 cycles:u # 2.224 GHz (74.23%) + 2,244,421 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%) + 5,188,348 stalled-cycles-backend:u # 0.58% backend cycles idle (76.10%) + 1,372,887,375 instructions:u # 1.54 insn per cycle + # 0.00 stalled cycles per insn (75.13%) + 0.429724861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.926720 sec - 15,303,062,403 cycles # 3.103 GHz - 38,574,821,235 instructions # 2.52 insn per cycle - 4.935986004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.465279e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.528277e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.528277e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.433247 sec + 15,327,724,629 cycles:u # 3.433 GHz (74.93%) + 9,891,067 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) + 1,296,374,959 stalled-cycles-backend:u # 8.46% backend cycles idle (74.92%) + 39,274,027,041 instructions:u # 2.56 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 4.467582066 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.903163 sec - 8,984,859,488 cycles # 3.089 GHz - 24,224,163,348 instructions # 2.70 insn per cycle - 2.918366508 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.423932e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.642478e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.642478e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.549029 sec + 8,692,877,164 cycles:u # 3.369 GHz (74.91%) + 9,320,788 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.92%) + 713,069,369 stalled-cycles-backend:u # 8.20% backend cycles idle (74.90%) + 24,130,962,669 instructions:u # 2.78 insn per cycle + # 0.03 stalled cycles per insn (74.91%) + 2.584633340 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.860423 sec - 5,396,289,064 cycles # 2.891 GHz - 11,276,510,611 instructions # 2.09 insn per cycle - 1.875091896 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.864236e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.484768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.484768e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.513255 sec + 5,092,010,127 cycles:u # 3.296 GHz (74.74%) + 8,390,120 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.67%) + 559,300,803 stalled-cycles-backend:u # 10.98% backend cycles idle (74.93%) + 11,400,119,673 instructions:u # 2.24 insn per cycle + # 0.05 stalled cycles per insn (75.15%) + 1.549217009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.648151 sec - 4,836,682,110 cycles # 2.924 GHz - 10,524,586,299 instructions # 2.18 insn per cycle - 1.662467551 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.587933 sec - 5,228,382,592 cycles # 2.016 GHz - 7,603,380,674 instructions # 1.45 insn per cycle - 2.604403134 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 4c3bdeb3a7..ed6d72052a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,210 +1,168 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:29:18 +DATE: 2024-03-03_14:06:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529654 sec - 2,293,467,091 cycles # 2.992 GHz - 3,241,408,242 instructions # 1.41 insn per cycle - 0.836485234 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.850594e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.923234e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.976389e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 +TOTAL : 0.376788 sec + 888,926,488 cycles:u # 2.214 GHz (74.23%) + 2,149,952 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.13%) + 5,354,153 stalled-cycles-backend:u # 0.60% backend cycles idle (74.13%) + 1,398,211,851 instructions:u # 1.57 insn per cycle + # 0.00 stalled cycles per insn (74.89%) + 0.430137061 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.994421 sec - 15,338,753,655 cycles # 3.068 GHz - 40,369,233,372 instructions # 2.63 insn per cycle - 5.002383718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.442637e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.503281e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.503281e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.472354 sec + 15,433,122,317 cycles:u # 3.426 GHz (74.96%) + 9,252,197 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 132,124,539 stalled-cycles-backend:u # 0.86% backend cycles idle (74.98%) + 40,169,887,469 instructions:u # 2.60 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 4.507625294 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.723159 sec - 8,478,435,163 cycles # 3.107 GHz - 23,253,497,249 instructions # 2.74 insn per cycle - 2.738604338 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.490737e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.715926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.715926e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.513977 sec + 8,592,144,959 cycles:u # 3.375 GHz (74.88%) + 10,115,550 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.88%) + 846,577,328 stalled-cycles-backend:u # 9.85% backend cycles idle (74.86%) + 23,518,270,139 instructions:u # 2.74 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 2.549704076 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.127824 sec - 6,241,547,842 cycles # 2.925 GHz - 12,962,413,577 instructions # 2.08 insn per cycle - 2.144515260 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.818297e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.281601e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.281601e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.717535 sec + 5,792,506,610 cycles:u # 3.311 GHz (74.66%) + 10,237,498 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.66%) + 739,803,153 stalled-cycles-backend:u # 12.77% backend cycles idle (74.87%) + 13,069,387,543 instructions:u # 2.26 insn per cycle + # 0.06 stalled cycles per insn (75.10%) + 1.753459731 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.074458 sec - 5,923,278,346 cycles # 2.853 GHz - 12,242,730,346 instructions # 2.07 insn per cycle - 2.086429072 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.794263 sec - 5,618,790,292 cycles # 2.007 GHz - 8,743,459,975 instructions # 1.56 insn per cycle - 2.808786612 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index c4c4bff630..5dc4677d13 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:29:46 +DATE: 2024-03-03_14:06:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463329 sec - 2,069,832,304 cycles # 3.002 GHz - 2,918,096,235 instructions # 1.41 insn per cycle - 0.772559551 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.902087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.056700e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.064313e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.413827 sec + 1,039,031,182 cycles:u # 2.496 GHz (73.86%) + 2,272,214 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.06%) + 5,009,677 stalled-cycles-backend:u # 0.48% backend cycles idle (75.09%) + 1,499,486,038 instructions:u # 1.44 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 0.461232645 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608947 sec - 2,562,374,732 cycles # 3.012 GHz - 3,879,371,783 instructions # 1.51 insn per cycle - 0.910123971 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.637828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.844690e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850403e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.621274 sec + 1,699,917,764 cycles:u # 2.621 GHz (75.45%) + 2,260,625 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.35%) + 4,823,180 stalled-cycles-backend:u # 0.28% backend cycles idle (75.59%) + 2,014,674,020 instructions:u # 1.19 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 0.674327003 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 
2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.359535 sec - 19,687,428,773 cycles # 3.094 GHz - 59,604,296,849 instructions # 3.03 insn per cycle - 6.365859123 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.946395e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.958721e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958721e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.584711 sec + 19,628,415,309 cycles:u # 3.501 GHz (74.97%) + 2,431,313 
stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) + 3,361,707,387 stalled-cycles-backend:u # 17.13% backend cycles idle (75.04%) + 57,925,297,511 instructions:u # 2.95 insn per cycle + # 0.06 stalled cycles per insn (75.04%) + 5.609734489 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.515479 sec - 10,373,655,779 cycles # 2.948 GHz - 30,676,465,519 instructions # 2.96 insn per cycle - 3.528584808 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.104957e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.156893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.156893e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.707312 sec + 9,533,927,731 cycles:u # 3.492 GHz (74.88%) + 2,474,777 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.82%) + 2,355,567,040 stalled-cycles-backend:u # 24.71% backend cycles idle (74.96%) + 29,949,411,318 instructions:u # 3.14 insn per cycle + # 0.08 stalled cycles per insn (75.09%) + 2.736384477 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.702212 sec - 4,885,421,396 cycles # 2.863 GHz - 11,020,224,832 instructions # 2.26 insn per cycle - 1.717667988 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.260863e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.283029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283029e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.322910 sec + 4,691,971,408 cycles:u # 3.487 GHz (74.89%) + 2,333,999 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%) + 1,527,580,422 stalled-cycles-backend:u # 32.56% backend cycles idle (75.03%) + 11,213,604,407 instructions:u # 2.39 insn per cycle + # 0.14 stalled cycles per insn (75.03%) + 1.349256507 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.517268 sec - 4,368,757,303 cycles # 2.872 GHz - 10,296,904,442 instructions # 2.36 insn per cycle - 1.532957385 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.135983 sec - 4,101,318,849 cycles # 1.917 GHz - 5,843,401,136 instructions # 1.42 insn per cycle - 2.151041040 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 7a80a6327c..4aab83e9a1 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_03:14:59 +DATE: 2024-03-03_14:53:26 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.494713 sec - 2,059,588,733 cycles # 2.926 GHz - 3,067,379,574 instructions # 1.49 insn per cycle - 0.764554853 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.490310e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.018073e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.018073e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.564985 sec + 1,665,296,767 cycles:u # 2.836 GHz (74.91%) + 10,273,236 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.51%) + 268,134,122 stalled-cycles-backend:u # 16.10% backend cycles idle (75.61%) + 1,999,752,314 instructions:u # 1.20 insn per cycle + # 0.13 stalled cycles per insn (75.59%) + 0.610025344 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.824199 sec - 3,179,114,916 cycles # 2.965 GHz - 5,069,610,946 instructions # 1.59 insn per cycle - 1.133521853 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.205511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674234e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.267387 sec + 3,811,409,644 cycles:u # 2.921 GHz (75.15%) + 29,986,646 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.15%) + 859,820,788 stalled-cycles-backend:u # 22.56% backend cycles idle (74.88%) + 3,799,477,591 instructions:u # 1.00 insn per cycle + # 0.23 stalled cycles per insn (74.89%) + 1.329741015 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.518056 sec - 19,750,480,394 cycles # 3.028 GHz - 59,611,727,500 instructions # 3.02 insn per cycle - 6.522447301 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.944931e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.957297e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 2.957297e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.591656 sec + 19,640,576,801 cycles:u # 3.498 GHz (74.93%) + 2,543,236 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) + 3,295,328,624 stalled-cycles-backend:u # 16.78% backend cycles idle (74.95%) + 57,913,028,019 instructions:u # 2.95 insn per cycle + # 0.06 stalled cycles per insn (75.07%) + 5.617003149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.370584 sec - 10,396,817,898 cycles # 3.081 GHz - 30,723,473,589 instructions # 2.96 insn per cycle - 3.375008450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.103744e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.156017e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.156017e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.712375 sec + 9,544,285,539 cycles:u # 3.489 GHz (74.75%) + 2,602,426 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) + 2,338,962,459 stalled-cycles-backend:u # 24.51% backend cycles idle (75.15%) + 29,996,079,828 instructions:u # 3.14 insn per cycle + # 0.08 stalled cycles per insn (75.15%) + 2.739150856 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.685691 sec - 4,902,930,827 cycles # 2.902 GHz - 11,066,989,869 instructions # 2.26 insn per cycle - 1.690115997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.255164e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.277054e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277054e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.333524 sec + 4,705,116,374 cycles:u # 3.469 GHz (74.77%) + 2,933,791 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.46%) + 1,535,647,412 stalled-cycles-backend:u # 32.64% backend cycles idle (74.76%) + 11,250,242,509 instructions:u # 2.39 insn per cycle + # 0.14 stalled cycles per insn (75.23%) + 1.360215835 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.513774 sec - 4,402,683,305 cycles # 2.901 GHz - 10,346,890,880 instructions # 2.35 insn per cycle - 1.518250177 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.132010 sec - 4,131,468,761 cycles # 1.935 GHz - 5,881,941,509 instructions # 1.42 insn per cycle - 2.136586909 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 90bf6e6455..66aaaaaf83 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:30:15 +DATE: 2024-03-03_14:06:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.465265 sec - 2,029,896,808 cycles # 2.980 GHz - 2,854,741,238 instructions # 1.41 insn per cycle - 0.763772288 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.876910e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024569e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.032530e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.397674 sec + 1,090,557,817 cycles:u # 2.613 GHz (73.44%) + 2,354,638 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.22%) + 4,877,535 stalled-cycles-backend:u # 0.45% backend cycles idle (74.90%) + 1,522,275,397 instructions:u # 1.40 insn per cycle + # 0.00 stalled cycles per insn (75.29%) + 0.445131973 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.607194 sec - 2,545,937,909 cycles # 2.996 GHz - 3,826,405,631 instructions # 1.50 insn per cycle - 0.909330494 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.593750e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.812455e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.818105e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.620125 sec + 1,698,536,250 cycles:u # 2.634 GHz (75.30%) + 2,170,927 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.20%) + 5,201,162 stalled-cycles-backend:u # 0.31% backend cycles idle (75.54%) + 1,984,023,513 instructions:u # 1.17 insn per cycle + # 0.00 stalled cycles per insn (75.44%) + 0.673847841 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 
2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.317260 sec - 19,445,883,412 cycles # 3.076 GHz - 58,795,735,881 instructions # 3.02 insn per cycle - 6.323702590 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.917218e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.929307e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.929307e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.639988 sec + 19,807,518,823 cycles:u # 3.498 GHz (74.99%) + 2,620,483 
stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 4,399,963,860 stalled-cycles-backend:u # 22.21% backend cycles idle (74.99%) + 57,755,607,483 instructions:u # 2.92 insn per cycle + # 0.08 stalled cycles per insn (74.99%) + 5.664770908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.363533 sec - 10,256,448,579 cycles # 3.046 GHz - 30,347,165,405 instructions # 2.96 insn per cycle - 3.377280590 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.014022e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.064686e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.064686e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.747663 sec + 9,670,552,206 cycles:u # 3.490 GHz (74.88%) + 2,329,087 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) + 2,099,453,756 stalled-cycles-backend:u # 21.71% backend cycles idle (74.90%) + 30,377,591,174 instructions:u # 3.14 insn per cycle + # 0.07 stalled cycles per insn (75.04%) + 2.774330523 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.728674 sec - 5,043,692,461 cycles # 2.911 GHz - 11,484,727,811 instructions # 2.28 insn per cycle - 1.738921569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.199979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219997e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.388695 sec + 4,910,323,743 cycles:u # 3.479 GHz (74.97%) + 2,629,932 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.07%) + 1,693,812,265 stalled-cycles-backend:u # 34.49% backend cycles idle (75.07%) + 11,669,214,722 instructions:u # 2.38 insn per cycle + # 0.15 stalled cycles per insn (75.07%) + 1.415480675 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.607009 sec - 4,642,681,786 cycles # 2.882 GHz - 10,842,961,046 instructions # 2.34 insn per cycle - 1.618440779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.134046 sec - 4,109,311,958 cycles # 1.922 GHz - 6,106,472,133 instructions # 1.49 insn per cycle - 2.145705149 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index af4f474b65..53e89252d5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:30:44 +DATE: 2024-03-03_14:07:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445727 sec - 2,001,558,197 cycles # 3.000 GHz - 2,820,746,449 instructions # 1.41 insn per cycle - 0.736568143 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.203921e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.929807e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.023161e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 +TOTAL : 0.328270 sec + 812,063,986 cycles:u # 2.328 GHz (73.46%) + 2,362,031 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.14%) + 4,822,306 stalled-cycles-backend:u # 0.59% backend cycles idle (75.14%) + 1,357,340,463 instructions:u # 1.67 insn per cycle + # 0.00 stalled cycles per insn (75.81%) + 0.372667349 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.500107 sec - 2,158,124,631 cycles # 2.977 GHz - 3,092,829,809 instructions # 1.43 insn per cycle - 0.784432881 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.361988e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630320e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.636311e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 +TOTAL : 0.443221 sec + 1,173,104,015 cycles:u # 2.521 GHz (74.19%) + 2,315,115 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.91%) + 4,226,657 stalled-cycles-backend:u # 0.36% backend cycles idle (74.98%) + 1,508,758,815 instructions:u # 1.29 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 0.490579299 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.146873 sec - 19,061,096,774 cycles # 3.099 GHz - 58,958,014,215 instructions # 3.09 insn per cycle - 6.153306662 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.266922e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.282433e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.282433e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.036678 sec + 17,710,352,240 cycles:u # 3.501 GHz 
(74.98%) + 2,410,483 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) + 2,136,534,101 stalled-cycles-backend:u # 12.06% backend cycles idle (75.02%) + 55,283,453,729 instructions:u # 3.12 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 5.061421346 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 
10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.886682 sec - 5,850,782,122 cycles # 3.096 GHz - 16,695,269,066 instructions # 2.85 insn per cycle - 1.898716135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.088894e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.106292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106292e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.525878 sec + 5,391,591,095 cycles:u # 3.482 GHz (74.72%) + 2,226,906 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.83%) + 1,682,078,164 stalled-cycles-backend:u # 31.20% backend cycles idle (75.06%) + 16,171,756,161 instructions:u # 3.00 insn per cycle + # 0.10 stalled cycles per insn (75.21%) + 1.551958544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.886334 sec - 2,581,461,055 cycles # 2.900 GHz - 5,980,838,355 instructions # 2.32 insn per cycle - 0.901108038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.355662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.435482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.435482e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.716733 sec + 2,558,521,208 cycles:u # 3.462 GHz (74.93%) + 2,064,335 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.11%) + 789,090,274 stalled-cycles-backend:u # 30.84% backend cycles idle (75.11%) + 6,104,077,165 instructions:u # 2.39 insn per cycle + # 0.13 stalled cycles per insn (75.12%) + 0.742553521 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.825324 sec - 2,349,134,788 cycles # 2.832 GHz - 5,603,128,082 instructions # 2.39 insn per cycle - 0.837493797 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.138775 sec - 2,054,810,359 cycles # 1.798 GHz - 3,334,038,485 instructions # 1.62 insn per cycle - 1.149410848 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index f62f4c8cdf..56198cb285 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_03:15:29 +DATE: 2024-03-03_14:53:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.451281 sec - 1,977,131,537 cycles # 2.986 GHz - 2,910,150,577 instructions # 1.47 insn per cycle - 0.718929629 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.278258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.631600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.631600e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 +TOTAL : 0.498007 sec + 1,466,960,248 cycles:u # 2.811 GHz (74.07%) + 11,056,858 stalled-cycles-frontend:u # 0.75% frontend cycles idle (74.97%) + 48,152,138 stalled-cycles-backend:u # 3.28% backend cycles idle (74.45%) + 1,977,966,925 instructions:u # 1.35 insn per cycle + # 0.02 stalled cycles per insn (74.03%) + 0.544490496 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.637857 sec - 2,608,085,808 cycles # 2.999 GHz - 3,961,129,191 instructions # 1.52 insn per cycle - 0.928114705 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.110902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.469331e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.469331e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 +TOTAL : 1.101783 sec + 3,362,274,754 cycles:u # 2.971 GHz (74.60%) + 29,414,665 stalled-cycles-frontend:u # 0.87% frontend cycles idle (74.88%) + 932,389,513 stalled-cycles-backend:u # 27.73% backend cycles idle (75.31%) + 3,400,944,853 instructions:u # 1.01 insn per cycle + # 0.27 stalled cycles per insn (75.29%) + 1.154441526 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe 
--common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.166590 sec - 19,068,958,964 cycles # 3.091 GHz - 58,962,429,433 instructions # 3.09 insn per cycle - 6.170849448 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.270228e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.285880e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.285880e+04 ) sec^-1 +MeanMatrixElemValue = ( 
4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.033945 sec + 17,685,688,410 cycles:u # 3.497 GHz (75.01%) + 2,445,054 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 2,136,376,004 stalled-cycles-backend:u # 12.08% backend cycles idle (75.01%) + 55,264,318,595 instructions:u # 3.12 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 5.059502444 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.898339 sec - 5,876,062,473 cycles # 3.090 GHz - 16,741,995,731 instructions # 2.85 insn per cycle - 1.902713080 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.080447e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.097594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097594e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.540979 sec + 5,440,621,359 cycles:u # 3.479 GHz (74.94%) + 2,245,671 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.94%) + 1,706,100,678 stalled-cycles-backend:u # 31.36% backend cycles idle (74.94%) + 16,203,790,784 instructions:u # 2.98 insn per cycle + # 0.11 stalled cycles per insn (74.94%) + 1.567488020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.895765 sec - 2,600,620,319 cycles # 2.891 GHz - 6,016,590,564 instructions # 2.31 insn per cycle - 0.900189489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.810420 sec - 2,363,958,510 cycles # 2.904 GHz - 5,639,045,986 instructions # 2.39 insn per cycle - 0.814799834 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.368533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.449245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.449245e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.715861 sec + 2,551,325,043 cycles:u # 3.453 GHz (74.92%) + 2,001,393 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.10%) + 794,495,145 stalled-cycles-backend:u # 31.14% backend cycles idle (75.10%) + 6,130,305,492 instructions:u # 2.40 insn per cycle + # 0.13 stalled cycles per insn (75.11%) + 0.742317031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.048212 sec - 2,071,251,869 cycles # 1.970 GHz - 3,374,799,702 instructions # 1.63 insn per cycle - 1.052574627 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index b43a9401e8..6560a5a7c4 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:31:09 +DATE: 2024-03-03_14:07:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.446885 sec - 1,972,174,797 cycles # 2.962 GHz - 2,746,314,290 instructions # 1.39 insn per cycle - 0.738224654 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.196681e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.928788e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.036655e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 +TOTAL : 0.329154 sec + 835,208,762 cycles:u # 2.395 GHz (73.30%) + 2,288,881 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.84%) + 5,214,532 stalled-cycles-backend:u # 0.62% backend cycles idle (74.91%) + 1,324,908,753 instructions:u # 1.59 insn per cycle + # 0.00 stalled cycles per insn (75.74%) + 0.373210994 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.497273 sec - 2,176,246,033 cycles # 3.004 GHz - 3,133,180,341 instructions # 1.44 insn per cycle - 0.782102946 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.426810e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.693888e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.700335e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 +TOTAL : 0.443145 sec + 1,181,226,145 cycles:u # 2.537 GHz (75.19%) + 2,204,442 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.91%) + 5,622,285 stalled-cycles-backend:u # 0.48% backend cycles idle (74.91%) + 1,584,875,011 instructions:u # 1.34 insn per cycle + # 0.00 stalled cycles per insn (75.83%) + 0.490414289 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669531526541 +Relative difference = 0.0005401805380429868 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.143350 sec - 18,995,848,931 cycles # 3.090 GHz - 58,700,265,502 instructions # 3.09 insn per cycle - 6.150073952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.251763e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.267144e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.267144e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.059826 sec + 17,789,469,969 cycles:u # 3.501 GHz 
(74.98%) + 2,674,859 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) + 3,105,019,101 stalled-cycles-backend:u # 17.45% backend cycles idle (74.97%) + 55,027,375,905 instructions:u # 3.09 insn per cycle + # 0.06 stalled cycles per insn (74.97%) + 5.084642260 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 
10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.804269 sec - 5,584,642,506 cycles # 3.088 GHz - 16,510,962,038 instructions # 2.96 insn per cycle - 1.819572816 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.124434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.143020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.143020e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.478176 sec + 5,232,431,526 cycles:u # 3.487 GHz (75.03%) + 2,092,634 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) + 1,443,047,898 stalled-cycles-backend:u # 27.58% backend cycles idle (74.95%) + 16,249,178,116 instructions:u # 3.11 insn per cycle + # 0.09 stalled cycles per insn (74.95%) + 1.504828895 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129857712652836 +Relative difference = 1.618803841657786e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.022630 sec - 2,975,513,176 cycles # 2.898 GHz - 6,634,498,276 instructions # 2.23 insn per cycle - 1.034400565 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.127769e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.192769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.192769e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.791024 sec + 2,813,900,368 cycles:u # 3.459 GHz (74.45%) + 2,924,467 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.60%) + 844,743,001 stalled-cycles-backend:u # 30.02% backend cycles idle (75.08%) + 6,732,540,107 instructions:u # 2.39 insn per cycle + # 0.13 stalled cycles per insn (75.43%) + 0.817224988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.945795 sec - 2,752,522,160 cycles # 2.898 GHz - 6,256,039,450 instructions # 2.27 insn per cycle - 0.961442115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.200320 sec - 2,230,572,619 cycles # 1.852 GHz - 3,698,329,997 instructions # 1.66 insn per cycle - 1.213663484 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 568d6c4513..266ca660a0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:31:34 +DATE: 2024-03-03_14:08:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463709 sec - 2,071,639,040 cycles # 3.004 GHz - 2,941,031,538 instructions # 1.42 insn per cycle - 0.764842159 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.902026e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.051350e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.057968e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.394057 sec + 1,072,493,406 cycles:u # 2.584 GHz (73.70%) + 2,358,166 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.91%) + 5,190,578 stalled-cycles-backend:u # 0.48% backend cycles idle (75.17%) + 1,527,179,402 instructions:u # 1.42 insn per cycle + # 0.00 stalled cycles per insn (75.92%) + 0.441197229 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608855 sec - 2,552,084,280 cycles # 3.004 GHz - 3,794,047,088 instructions # 1.49 insn per cycle - 0.909216297 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.641496e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.852228e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.621271 sec + 1,710,114,837 cycles:u # 2.637 GHz (74.54%) + 2,243,098 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.24%) + 5,004,865 stalled-cycles-backend:u # 0.29% backend cycles idle (75.41%) + 2,017,082,158 instructions:u # 1.18 insn per cycle + # 0.00 stalled cycles per insn (75.36%) + 0.674174885 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 
64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.456566 sec - 20,000,355,725 cycles # 3.096 GHz - 60,532,425,335 instructions # 3.03 insn per cycle - 6.462989015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.889373e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.901249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.901249e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.694274 sec + 19,995,566,723 cycles:u # 3.498 GHz (74.95%) + 2,721,102 
stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 4,191,817,537 stalled-cycles-backend:u # 20.96% backend cycles idle (74.96%) + 59,211,742,429 instructions:u # 2.96 insn per cycle + # 0.07 stalled cycles per insn (75.00%) + 5.719077736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.288178 sec - 10,191,043,016 cycles # 3.096 GHz - 30,384,591,666 instructions # 2.98 insn per cycle - 3.302408299 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.112243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.164517e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.164517e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.704115 sec + 9,537,760,029 cycles:u # 3.498 GHz (74.86%) + 2,331,362 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) + 2,352,954,265 stalled-cycles-backend:u # 24.67% backend cycles idle (75.06%) + 29,754,943,887 instructions:u # 3.12 insn per cycle + # 0.08 stalled cycles per insn (75.07%) + 2.730308483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.686926 sec - 4,874,678,301 cycles # 2.883 GHz - 10,979,160,826 instructions # 2.25 insn per cycle - 1.698730583 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.262207e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.284404e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.284404e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.321482 sec + 4,683,498,847 cycles:u # 3.484 GHz (74.95%) + 2,266,589 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) + 1,546,576,102 stalled-cycles-backend:u # 33.02% backend cycles idle (75.01%) + 11,204,209,244 instructions:u # 2.39 insn per cycle + # 0.14 stalled cycles per insn (75.01%) + 1.347795806 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.469271 sec - 4,278,421,569 cycles # 2.904 GHz - 10,248,685,624 instructions # 2.40 insn per cycle - 1.480280367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.183850 sec - 4,204,822,902 cycles # 1.923 GHz - 6,044,506,630 instructions # 1.44 insn per cycle - 2.192719745 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 2001d2a062..ed7f390279 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:32:03 +DATE: 2024-03-03_14:08:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.461655 sec - 2,079,301,655 cycles # 3.013 GHz - 2,945,288,445 instructions # 1.42 insn per cycle - 0.761228896 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.898925e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041591e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050657e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 +TOTAL : 0.392553 sec + 1,067,884,395 cycles:u # 2.579 GHz (73.45%) + 2,397,019 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.29%) + 5,343,057 stalled-cycles-backend:u # 0.50% backend cycles idle (74.99%) + 1,520,090,207 instructions:u # 1.42 insn per cycle + # 0.00 stalled cycles per insn (75.71%) + 0.437411352 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603998 sec - 2,550,056,991 cycles # 3.016 GHz - 3,770,712,997 instructions # 1.48 insn per cycle - 0.905342631 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.627682e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.835274e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.840980e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 +TOTAL : 0.615989 sec + 1,701,218,541 cycles:u # 2.643 GHz (75.08%) + 2,280,823 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.16%) + 5,233,329 stalled-cycles-backend:u # 0.31% backend cycles idle (75.52%) + 1,993,728,288 instructions:u # 1.17 insn per cycle + # 0.00 stalled cycles per insn (75.43%) + 0.670239749 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 
64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.482109 sec - 19,897,203,281 cycles # 3.068 GHz - 59,934,079,759 instructions # 3.01 insn per cycle - 6.488470935 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.902767e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.914803e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.914803e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.668045 sec + 19,920,122,262 cycles:u # 3.500 GHz (74.98%) + 2,563,039 
stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 4,140,843,564 stalled-cycles-backend:u # 20.79% backend cycles idle (74.98%) + 58,705,733,304 instructions:u # 2.95 insn per cycle + # 0.07 stalled cycles per insn (74.98%) + 5.693347934 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.246582 sec - 10,068,513,741 cycles # 3.097 GHz - 30,097,905,174 instructions # 2.99 insn per cycle - 3.264343936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.199590e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.254972e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.254972e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.666534 sec + 9,395,499,000 cycles:u # 3.494 GHz (74.98%) + 2,544,995 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) + 2,073,809,606 stalled-cycles-backend:u # 22.07% backend cycles idle (75.01%) + 30,156,482,499 instructions:u # 3.21 insn per cycle + # 0.07 stalled cycles per insn (75.02%) + 2.693096800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.728964 sec - 5,016,079,762 cycles # 2.895 GHz - 11,483,054,886 instructions # 2.29 insn per cycle - 1.742427809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.217824e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.238455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.238455e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.368644 sec + 4,832,692,381 cycles:u # 3.473 GHz (74.71%) + 2,500,289 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.75%) + 1,555,804,345 stalled-cycles-backend:u # 32.19% backend cycles idle (74.93%) + 11,688,372,170 instructions:u # 2.42 insn per cycle + # 0.13 stalled cycles per insn (75.18%) + 1.395801996 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.580395 sec - 4,590,869,899 cycles # 2.898 GHz - 10,811,034,467 instructions # 2.35 insn per cycle - 1.596114627 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.184061 sec - 4,216,157,602 cycles # 1.927 GHz - 6,273,944,868 instructions # 1.49 insn per cycle - 2.195028764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index c4f627d4b9..c60f4bd576 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:32:32 +DATE: 2024-03-03_14:08:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491439e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526891 sec - 2,312,216,646 cycles # 3.007 GHz - 3,538,385,257 instructions # 1.53 insn per cycle - 0.841955777 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.443213e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.592458e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.593832e+04 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.658516 sec + 1,915,656,349 cycles:u # 2.930 GHz (74.58%) + 2,322,198 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.44%) + 5,431,561 stalled-cycles-backend:u # 0.28% backend cycles idle (74.96%) + 2,119,719,157 instructions:u # 1.11 insn per cycle + # 0.00 stalled cycles per insn (76.05%) + 0.704615889 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.037875 sec - 10,086,152,870 cycles # 3.059 GHz - 22,511,661,776 instructions # 2.23 insn per cycle - 3.352868148 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.243645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246133e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246191e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 7.700449 sec + 26,524,218,435 cycles:u # 3.432 GHz (74.95%) + 3,249,930 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 4,782,652 stalled-cycles-backend:u # 0.02% backend cycles idle (75.06%) + 21,094,095,538 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 7.758120507 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.365178 sec - 25,629,682,297 cycles # 3.063 GHz - 78,935,463,104 instructions # 3.08 insn per cycle - 8.371779038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.232576e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.233464e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.233464e+03 ) sec^-1 +MeanMatrixElemValue 
= ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.363302 sec + 25,824,388,635 cycles:u # 3.501 GHz (74.95%) + 1,475,547 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 3,692,253,610 stalled-cycles-backend:u # 14.30% backend cycles idle (74.96%) + 81,773,513,781 instructions:u # 3.17 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 7.388142336 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 
1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.352554 sec - 12,920,825,541 cycles # 2.966 GHz - 39,280,019,197 instructions # 3.04 insn per cycle - 4.370436126 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.049918e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.054493e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.054493e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.255362 sec + 11,447,299,246 cycles:u # 3.492 GHz (74.89%) + 1,095,573 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 1,370,623,003 stalled-cycles-backend:u # 11.97% backend cycles idle (75.07%) + 39,243,650,589 instructions:u # 3.43 insn per cycle + # 0.03 stalled cycles per insn (75.11%) + 3.281919712 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.920439 sec - 5,577,220,412 cycles # 2.899 GHz - 13,686,699,383 instructions # 2.45 insn per cycle - 1.933532640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.209079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211700e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.211700e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.364169 sec + 4,834,989,119 cycles:u # 3.486 GHz (74.79%) + 724,221 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.20%) + 559,585,514 stalled-cycles-backend:u # 11.57% backend cycles idle (75.20%) + 13,846,879,661 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (74.92%) + 1.390361830 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.708010 sec - 4,898,677,790 cycles # 2.863 GHz - 12,341,670,637 instructions # 2.52 insn per cycle - 1.722166284 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.187284 sec - 4,109,191,778 cycles # 1.875 GHz - 6,335,550,253 instructions # 1.54 insn per cycle - 2.200752564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 8d1778e673..fc9bbc7387 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:16:28 +DATE: 2024-03-03_14:54:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.511155 sec - 2,228,194,908 cycles # 3.016 GHz - 3,541,287,827 instructions # 1.59 insn per cycle - 0.799045956 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.380234e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.523976e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.523976e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.660049 sec + 1,987,141,382 cycles:u # 2.923 GHz (74.61%) + 2,974,543 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.40%) + 34,864,776 stalled-cycles-backend:u # 1.75% backend cycles idle (74.77%) + 2,225,515,652 instructions:u # 1.12 insn per cycle + # 0.02 stalled cycles per insn (74.91%) + 0.705064086 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.305480 sec - 10,998,775,521 cycles # 3.077 GHz - 24,493,841,360 instructions # 2.23 insn per cycle - 3.633710964 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.209603e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.244411e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.244411e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.550769 sec + 29,278,144,313 cycles:u # 3.408 GHz (75.04%) + 22,139,444 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.05%) + 1,142,582,420 stalled-cycles-backend:u # 3.90% backend cycles idle (75.04%) + 23,517,292,837 instructions:u # 0.80 insn per cycle + # 0.05 stalled cycles per insn (75.05%) + 8.613741102 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.395628 sec - 25,661,453,890 cycles # 3.059 GHz - 78,946,626,848 instructions # 3.08 insn per cycle - 8.400144517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.203000e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.203872e+03 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.203872e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.455869 sec + 26,168,247,565 cycles:u # 3.499 GHz (74.97%) + 34,946,009 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.97%) + 3,763,223,846 stalled-cycles-backend:u # 14.38% backend cycles idle (74.97%) + 81,775,845,502 instructions:u # 3.13 insn per cycle + # 0.05 stalled cycles per insn (74.98%) + 7.481363596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.352704 sec - 12,939,532,043 cycles # 2.970 GHz - 39,292,271,047 instructions # 3.04 insn per cycle - 4.357352756 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.060165e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.064745e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.064745e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.252571 sec + 11,437,798,335 cycles:u # 3.491 GHz (74.87%) + 1,096,110 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 1,362,677,177 stalled-cycles-backend:u # 11.91% backend cycles idle (75.08%) + 39,244,727,057 instructions:u # 3.43 insn per cycle + # 0.03 stalled cycles per insn (75.10%) + 3.279634839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.929060 sec - 5,589,750,479 cycles # 2.892 GHz - 13,696,577,373 instructions # 2.45 insn per cycle - 1.933630865 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.209381e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.212036e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.212036e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.367520 sec + 4,809,725,861 cycles:u # 3.458 GHz (74.73%) + 786,688 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.80%) + 534,931,736 stalled-cycles-backend:u # 11.12% backend cycles idle (75.06%) + 13,801,628,982 instructions:u # 2.87 insn per cycle + # 0.04 stalled cycles per insn (75.27%) + 1.394470657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.695619 sec - 4,910,055,408 cycles # 2.889 GHz - 12,351,492,799 instructions # 2.52 insn per cycle - 1.700097015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.165843 sec - 4,123,850,554 cycles # 1.901 GHz - 6,345,407,560 instructions # 1.54 insn per cycle - 2.170297070 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 597fd5665a..c215267a23 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,224 +1,182 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to 
be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:28:08 +DATE: 2024-03-03_15:04:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.385233e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.605323e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.606526e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.505991 sec - 2,242,092,583 cycles # 3.014 GHz - 3,466,791,908 instructions # 1.55 insn per cycle - 0.811853126 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.649809 sec + 1,951,525,535 cycles:u # 2.906 GHz (74.92%) + 2,497,767 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.10%) + 34,093,522 stalled-cycles-backend:u # 1.75% backend cycles idle (75.36%) + 2,162,495,237 instructions:u # 1.11 insn per cycle + # 0.02 stalled cycles per insn (75.19%) + 0.693235115 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.243555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246652e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.124130 sec - 10,356,034,147 cycles # 3.069 GHz - 23,417,816,833 instructions # 2.26 insn per cycle - 3.433693053 seconds time elapsed +TOTAL : 8.390908 sec + 28,908,247,505 cycles:u # 3.432 GHz (74.92%) + 
11,922,008 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.98%) + 1,149,312,271 stalled-cycles-backend:u # 3.98% backend cycles idle (75.03%) + 22,694,060,209 instructions:u # 0.79 insn per cycle + # 0.05 stalled cycles per insn (75.03%) + 8.446085700 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.203553e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204441e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204441e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.389537 sec - 25,646,805,438 cycles # 3.056 GHz - 78,935,262,340 instructions # 3.08 insn per cycle - 8.393631651 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.450336 sec + 26,160,134,176 cycles:u # 3.501 GHz (74.95%) + 35,129,020 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.92%) + 3,765,184,530 stalled-cycles-backend:u # 14.39% backend cycles idle (74.97%) + 81,743,104,429 instructions:u # 3.12 insn per cycle + # 0.05 stalled cycles per insn (75.06%) + 7.474754501 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.036858e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.041413e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.041413e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.369422 sec - 12,916,153,129 cycles # 2.954 GHz - 39,278,867,860 instructions # 3.04 insn per cycle - 4.373667823 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.263765 sec + 11,479,314,951 cycles:u # 3.493 GHz (74.93%) + 1,078,236 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) + 1,374,719,385 stalled-cycles-backend:u # 11.98% backend cycles idle (74.93%) + 39,301,165,601 instructions:u # 3.42 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 3.288037777 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.210717e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213341e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213341e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.933878 sec - 5,580,678,683 cycles # 2.881 GHz - 13,684,529,284 instructions # 2.45 insn per cycle - 1.937965494 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 1.362358 sec + 4,821,236,803 cycles:u # 3.482 GHz (74.69%) + 1,484,452 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.16%) + 494,259,401 stalled-cycles-backend:u # 10.25% backend cycles idle (75.16%) + 13,804,004,919 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (75.16%) + 1.386492812 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.697628 sec - 4,903,453,092 cycles # 2.882 GHz - 12,338,806,795 instructions # 2.52 insn per cycle - 1.701856837 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.253262 sec - 4,111,107,725 cycles # 1.822 GHz - 6,332,329,650 instructions # 1.54 insn per cycle - 2.257544828 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index de32359ede..fd095032ba 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,227 +1,187 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:21:36 +DATE: 2024-03-03_15:01:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.508517 sec - 2,246,531,629 cycles # 3.011 GHz - 3,559,465,442 instructions # 1.58 insn per cycle - 0.806328345 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.448083e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.580114e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.581811e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.648683 sec + 1,955,873,857 cycles:u # 2.904 GHz (75.51%) + 3,004,987 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.06%) + 33,884,000 stalled-cycles-backend:u # 1.73% backend cycles idle (75.16%) + 2,161,496,629 instructions:u # 1.11 insn per cycle + # 0.02 stalled cycles per insn (75.54%) + 0.692984705 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.195111 sec - 10,565,694,760 cycles # 3.061 GHz - 24,272,327,456 instructions # 2.30 insn per cycle - 3.508790742 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.209371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.241273e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241330e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.510530 sec + 29,321,272,330 cycles:u # 3.430 GHz (75.00%) + 22,499,031 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) + 1,140,594,084 stalled-cycles-backend:u # 3.89% backend cycles idle (75.03%) + 23,505,210,458 instructions:u # 0.80 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 8.565646128 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.415718 sec - 25,630,796,247 cycles # 3.044 GHz - 78,935,144,677 instructions # 3.08 insn per cycle - 8.419920398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.196828e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.197708e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197708e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.473100 sec + 26,252,974,565 cycles:u # 3.503 GHz (74.96%) + 42,014,245 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.00%) + 3,761,392,018 stalled-cycles-backend:u # 14.33% backend cycles idle (75.03%) + 81,736,635,956 instructions:u # 3.11 insn per cycle + # 0.05 stalled cycles per insn (75.03%) + 7.497564777 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.383944 sec - 12,941,364,841 cycles # 2.950 GHz - 39,279,009,350 instructions # 3.04 insn per cycle - 4.388336169 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.059285e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.063808e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.063808e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.249304 sec + 11,441,253,986 cycles:u # 3.497 GHz (74.91%) + 1,103,861 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 1,360,429,247 stalled-cycles-backend:u # 11.89% backend cycles idle (75.06%) + 39,250,559,070 instructions:u # 3.43 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 3.273415731 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.951803 sec - 5,576,482,664 cycles # 2.852 GHz - 13,685,505,947 instructions # 2.45 insn per cycle - 1.956019187 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.211457e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.214083e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214083e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.361522 sec + 4,818,596,084 cycles:u # 3.482 GHz (74.77%) + 724,451 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.01%) + 509,043,737 stalled-cycles-backend:u # 10.56% backend cycles idle (75.15%) + 13,808,567,781 instructions:u # 2.87 insn per cycle + # 0.04 stalled cycles per insn (75.15%) + 1.385716850 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.690955 sec - 4,892,330,509 cycles # 2.888 GHz - 12,340,572,549 instructions # 2.52 insn per cycle - 1.695111197 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.155567 sec - 4,105,793,778 cycles # 1.902 GHz - 6,333,858,387 instructions # 1.54 insn per cycle - 2.159935327 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 836b2fd223..b4e3b36de1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:33:08 +DATE: 2024-03-03_14:09:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.492178e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523446 sec - 2,259,779,898 cycles # 2.994 GHz - 3,514,783,609 instructions # 1.56 insn per cycle - 0.830655921 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.386128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.441368e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.441866e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.520173 sec + 1,518,418,478 cycles:u # 2.822 GHz (74.19%) + 2,284,692 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.45%) + 5,557,571 stalled-cycles-backend:u # 0.37% backend cycles idle (74.52%) + 1,877,465,075 instructions:u # 1.24 insn per cycle + # 0.00 stalled cycles per insn (74.75%) + 0.576564042 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.027147 sec - 10,102,095,677 cycles # 3.066 GHz - 22,774,733,235 instructions # 2.25 insn per cycle - 3.352533111 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.739084e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.744120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.744244e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 6.344064 sec + 21,795,921,727 cycles:u # 3.421 GHz (74.89%) + 3,060,139 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) + 5,868,247 stalled-cycles-backend:u # 0.03% backend cycles idle (75.02%) + 17,480,500,624 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 6.398703650 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.342362 sec - 25,562,894,530 cycles # 3.064 GHz - 78,707,498,900 instructions # 3.08 insn per cycle - 8.350709191 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.227750e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.228660e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.228660e+03 ) sec^-1 +MeanMatrixElemValue 
= ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.369278 sec + 25,879,528,720 cycles:u # 3.501 GHz (75.00%) + 6,710,952 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) + 3,434,328,529 stalled-cycles-backend:u # 13.27% backend cycles idle (75.00%) + 81,766,955,675 instructions:u # 3.16 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 7.393999483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 
1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.374701 sec - 12,919,245,066 cycles # 2.951 GHz - 39,226,355,054 instructions # 3.04 insn per cycle - 4.387657418 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.031721e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.036221e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.036221e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.266876 sec + 11,494,978,612 cycles:u # 3.494 GHz (74.95%) + 1,901,440 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 1,517,805,835 stalled-cycles-backend:u # 13.20% backend cycles idle (74.95%) + 39,254,997,853 instructions:u # 3.41 insn per cycle + # 0.04 stalled cycles per insn (74.96%) + 3.293275469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.987975 sec - 5,629,143,308 cycles # 2.825 GHz - 13,800,788,871 instructions # 2.45 insn per cycle - 1.999251955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.207772e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210381e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.210381e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.365420 sec + 4,828,463,830 cycles:u # 3.478 GHz (74.71%) + 750,092 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.69%) + 598,745,360 stalled-cycles-backend:u # 12.40% backend cycles idle (74.93%) + 13,824,866,165 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (75.18%) + 1.391810341 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.716692 sec - 4,942,228,477 cycles # 2.873 GHz - 12,466,581,724 instructions # 2.52 insn per cycle - 1.728222884 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.159145 sec - 4,117,977,410 cycles # 1.904 GHz - 6,458,802,297 instructions # 1.57 insn per cycle - 2.172057894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 5cb26f1dc5..305b2aa7ff 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:05:40 +DATE: 2024-03-03_14:35:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.264818e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529504 sec - 2,311,611,520 cycles # 3.006 GHz - 3,548,053,349 instructions # 1.53 insn per cycle - 0.826491750 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.406410e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.571548e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.572717e+04 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.638815 sec + 1,900,407,247 cycles:u # 2.901 GHz (75.09%) + 2,254,354 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.58%) + 5,396,830 stalled-cycles-backend:u # 0.28% backend cycles idle (75.60%) + 2,102,115,004 instructions:u # 1.11 insn per cycle + # 0.00 stalled cycles per insn (75.59%) + 0.685261289 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.298192 sec - 10,832,117,508 cycles # 3.051 GHz - 23,123,371,744 instructions # 2.13 insn per cycle - 3.609870208 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.243278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.245902e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245963e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 7.699819 sec + 26,542,369,018 cycles:u # 3.434 GHz (74.95%) + 3,288,397 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 4,842,522 stalled-cycles-backend:u # 0.02% backend cycles idle (75.06%) + 21,115,129,397 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 7.757988929 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.966049 sec - 113,615,073,618 cycles # 3.074 GHz - 144,968,095,911 instructions # 1.28 insn per cycle - 36.970400514 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.571026e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.571408e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.571408e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 35.886395 sec + 125,842,685,634 cycles:u # 3.505 GHz (74.99%) + 13,532,780 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 15,368,715,731 stalled-cycles-backend:u # 12.21% backend cycles idle (75.00%) + 141,501,900,955 instructions:u # 1.12 insn per cycle + # 0.11 stalled cycles per insn (75.00%) + 35.911647079 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.007790 sec - 14,730,075,423 cycles # 2.939 GHz - 37,574,123,368 instructions # 2.55 insn per cycle - 5.012256986 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.508609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.510771e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.510771e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.682382 sec + 16,441,765,172 cycles:u # 3.495 GHz (75.01%) + 11,758,570 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) + 6,326,199,000 stalled-cycles-backend:u # 38.48% backend cycles idle (75.01%) + 37,548,136,278 instructions:u # 2.28 insn per cycle + # 0.17 stalled cycles per insn (75.01%) + 4.708772467 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.127650 sec - 6,163,100,705 cycles # 2.892 GHz - 13,061,449,928 instructions # 2.12 insn per cycle - 2.132187716 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.553279e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.563561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.563561e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.179424 sec + 7,678,063,051 cycles:u # 3.486 GHz (74.95%) + 742,936 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 4,343,621,516 stalled-cycles-backend:u # 56.57% backend cycles idle (74.94%) + 12,960,132,017 instructions:u # 1.69 insn per cycle + # 0.34 stalled cycles per insn (74.94%) + 2.206216616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.743142 sec - 5,059,957,423 cycles # 2.897 GHz - 11,440,000,239 instructions # 2.26 insn per cycle - 1.747501406 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.075865 sec - 3,979,244,183 cycles # 1.914 GHz - 5,942,139,795 instructions # 1.49 insn per cycle - 2.080305520 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index afca4b7953..3f2290590f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:06:48 +DATE: 2024-03-03_14:36:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.275983e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.531287 sec - 2,311,991,159 cycles # 3.015 GHz - 3,584,221,599 instructions # 1.55 insn per cycle - 0.825938734 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.381730e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.434033e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.434455e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.520350 sec + 1,520,729,660 cycles:u # 2.822 GHz (73.14%) + 2,385,310 stalled-cycles-frontend:u # 0.16% frontend cycles idle (73.56%) + 5,252,011 stalled-cycles-backend:u # 0.35% backend cycles idle (74.78%) + 1,856,224,661 instructions:u # 1.22 insn per cycle + # 0.00 stalled cycles per insn (74.81%) + 0.565229996 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.269849 sec - 10,805,743,512 cycles # 3.068 GHz - 25,084,175,459 instructions # 2.32 insn per cycle - 3.579404730 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.738254e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.743407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.743529e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 6.346686 sec + 21,768,032,455 cycles:u # 3.414 GHz (74.98%) + 2,929,134 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 4,917,671 stalled-cycles-backend:u # 0.02% backend cycles idle (75.08%) + 17,435,737,061 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.09%) + 6.404044795 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.253529 sec - 114,121,742,420 cycles # 3.069 GHz - 145,689,073,244 instructions # 1.28 insn per cycle - 37.257693750 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.526933e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.527306e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.527306e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 36.235634 sec + 127,055,211,551 cycles:u # 3.504 GHz (75.00%) + 16,943,775 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 15,460,991,830 stalled-cycles-backend:u # 12.17% backend cycles idle (75.00%) + 141,672,992,120 instructions:u # 1.12 insn per cycle + # 0.11 stalled cycles per insn (75.00%) + 36.260772993 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.136766 sec - 15,152,451,249 cycles # 2.948 GHz - 37,761,291,325 instructions # 2.49 insn per cycle - 5.141156615 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.565168e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.567504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.567504e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.608055 sec + 16,189,817,583 cycles:u # 3.496 GHz (74.95%) + 4,727,401 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.95%) + 6,417,413,717 stalled-cycles-backend:u # 39.64% backend cycles idle (74.96%) + 37,648,163,335 instructions:u # 2.33 insn per cycle + # 0.17 stalled cycles per insn (74.98%) + 4.634664036 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072422 sec - 6,013,210,013 cycles # 2.896 GHz - 12,895,807,400 instructions # 2.14 insn per cycle - 2.076740513 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.770607e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.781417e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.781417e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.118504 sec + 7,454,241,266 cycles:u # 3.481 GHz (74.97%) + 787,228 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 4,288,175,252 stalled-cycles-backend:u # 57.53% backend cycles idle (74.97%) + 12,851,745,046 instructions:u # 1.72 insn per cycle + # 0.33 stalled cycles per insn (74.97%) + 2.144706465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.755119 sec - 5,091,337,522 cycles # 2.895 GHz - 11,446,622,503 instructions # 2.25 insn per cycle - 1.759562583 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.059473 sec - 3,944,538,203 cycles # 1.912 GHz - 5,896,184,476 instructions # 1.49 insn per cycle - 2.063940696 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 082176c355..667b1207c3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:33:45 +DATE: 2024-03-03_14:10:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401451e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481440 sec - 2,077,514,231 cycles # 2.979 GHz - 3,093,505,744 instructions # 1.49 insn per cycle - 0.777796663 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.559555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.765298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.766948e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.426954 sec + 1,155,934,779 cycles:u # 2.607 GHz (75.85%) + 2,270,453 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.69%) + 5,599,816 stalled-cycles-backend:u # 0.48% backend cycles idle (75.28%) + 1,588,765,906 instructions:u # 1.37 insn per cycle + # 0.00 stalled cycles per insn (74.63%) + 0.473207237 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.622317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.713365 sec - 5,944,272,538 cycles # 3.053 GHz - 12,632,277,461 instructions # 2.13 insn per cycle - 2.004079656 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.702651e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.730089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.730609e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.615281 sec + 8,762,195,591 cycles:u # 3.319 GHz (74.91%) + 2,598,778 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.13%) + 5,515,649 stalled-cycles-backend:u # 0.06% backend cycles idle (75.03%) + 7,373,798,662 instructions:u # 0.84 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 2.666169145 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 
2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.010109 sec - 24,614,432,061 cycles # 3.072 GHz - 78,126,558,251 instructions # 3.17 insn per cycle - 8.016891762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.471402e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.472463e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.472463e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.642044 sec + 23,349,584,089 
cycles:u # 3.504 GHz (74.93%) + 1,322,457 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,539,618,789 stalled-cycles-backend:u # 10.88% backend cycles idle (75.01%) + 75,891,173,645 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 6.666703631 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.228676 sec - 6,461,822,382 cycles # 2.894 GHz - 20,120,855,558 instructions # 3.11 insn per cycle - 2.241648353 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.951991e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.970075e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.970075e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.654004 sec + 5,841,823,067 cycles:u # 3.485 GHz (74.75%) + 702,558 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 854,737,495 stalled-cycles-backend:u # 14.63% backend cycles idle (75.19%) + 20,134,073,878 instructions:u # 3.45 insn per cycle + # 0.04 stalled cycles per insn (75.19%) + 1.680033246 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.990019 sec - 2,821,251,649 cycles # 2.839 GHz - 6,989,221,748 instructions # 2.48 insn per cycle - 1.002444816 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.387538e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.397939e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.397939e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.692996 sec + 2,470,623,307 cycles:u # 3.453 GHz (74.55%) + 588,565 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.07%) + 257,432,909 stalled-cycles-backend:u # 10.42% backend cycles idle (75.42%) + 7,058,314,336 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (75.42%) + 0.718886614 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.861179 sec - 2,488,986,957 cycles # 2.876 GHz - 6,296,476,670 instructions # 2.53 insn per cycle - 0.887481911 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.078476 sec - 2,048,809,794 cycles # 1.894 GHz - 3,266,667,713 instructions # 1.59 insn per cycle - 1.091634951 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 6f564b583c..c41e2519f4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to 
be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:17:05 +DATE: 2024-03-03_14:55:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.468201 sec - 2,060,292,715 cycles # 2.983 GHz - 3,094,906,819 instructions # 1.50 insn per cycle - 0.750075013 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.582299e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.759016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.759016e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.444663 sec + 1,257,995,614 cycles:u # 2.712 GHz (74.36%) + 3,323,803 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.32%) + 33,750,386 stalled-cycles-backend:u # 2.68% backend cycles idle (74.26%) + 1,681,094,930 instructions:u # 1.34 insn per cycle + # 0.02 stalled cycles per insn (74.22%) + 0.489024988 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.882218 sec - 6,478,461,444 cycles # 3.059 GHz - 12,879,929,349 instructions # 1.99 insn per cycle - 2.174649918 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.268559e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.711027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.711027e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.448567 sec + 11,570,440,658 cycles:u # 3.326 GHz (74.91%) + 38,666,267 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.90%) + 1,139,194,577 stalled-cycles-backend:u # 9.85% backend cycles idle (74.95%) + 9,919,737,336 instructions:u # 0.86 insn per cycle + # 0.11 stalled cycles per insn (75.01%) + 3.502089428 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.044775 sec - 24,623,818,516 cycles # 3.060 GHz - 78,132,484,739 instructions # 3.17 insn per cycle - 8.049291657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.475508e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476558e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476558e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.632831 sec + 23,294,407,490 cycles:u # 3.500 GHz (75.00%) + 2,385,816 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 2,694,046,041 stalled-cycles-backend:u # 11.57% backend cycles idle (75.00%) + 75,886,020,623 instructions:u # 3.26 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 6.657837903 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.197009 sec - 6,464,288,620 cycles # 2.938 GHz - 20,129,426,624 instructions # 3.11 insn per cycle - 2.201352169 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.920792e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.938871e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.938871e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.661196 sec + 5,849,624,843 cycles:u # 3.474 GHz (74.82%) + 744,753 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.82%) + 890,080,654 stalled-cycles-backend:u # 15.22% backend cycles idle (74.87%) + 20,186,191,059 instructions:u # 3.45 insn per cycle + # 0.04 stalled cycles per insn (75.11%) + 1.687832899 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.973161 sec - 2,827,392,405 cycles # 2.894 GHz - 6,998,075,079 instructions # 2.48 insn per cycle - 0.977561277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.859317 sec - 2,491,742,914 cycles # 2.887 GHz - 6,305,390,293 instructions # 2.53 insn per cycle - 0.863665296 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.381518e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.391943e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.391943e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.696738 sec + 2,468,710,150 cycles:u # 3.431 GHz (74.46%) + 573,862 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.64%) + 272,091,914 stalled-cycles-backend:u # 11.02% backend cycles idle (75.18%) + 7,069,772,671 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (75.56%) + 0.722935447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.067932 sec - 2,057,227,059 cycles # 1.920 GHz - 3,276,345,738 instructions # 1.59 insn per cycle - 1.072312021 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 66226e8d59..1e1eb5616b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,224 +1,182 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to 
be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:28:45 +DATE: 2024-03-03_15:05:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.363626e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.461299 sec - 2,006,885,691 cycles # 2.992 GHz - 3,022,532,155 instructions # 1.51 insn per cycle - 0.728549346 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.496857e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.768561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.770072e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.441201 sec + 1,236,286,976 cycles:u # 2.684 GHz (74.02%) + 2,734,282 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.50%) + 38,777,979 stalled-cycles-backend:u # 3.14% backend cycles idle (75.81%) + 1,561,318,017 instructions:u # 1.26 insn per cycle + # 0.02 stalled cycles per insn (75.69%) + 0.484851074 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.795584 sec - 6,148,728,410 cycles # 3.042 GHz - 12,326,233,623 instructions # 2.00 insn per cycle - 2.078967785 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.687980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.723773e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.724201e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.310015 sec + 11,092,246,100 cycles:u # 3.325 GHz (75.06%) + 27,936,907 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.11%) + 1,142,037,661 stalled-cycles-backend:u # 10.30% backend cycles idle (75.12%) + 8,995,510,132 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (75.11%) + 3.359963375 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common 
-p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.994149 sec - 24,620,138,866 cycles # 3.079 GHz - 78,125,377,108 instructions # 3.17 insn per cycle - 7.998228624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.474042e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.475089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.475089e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.634873 sec + 23,324,491,000 cycles:u # 3.504 GHz (74.99%) + 1,298,892 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 2,500,617,481 stalled-cycles-backend:u # 10.72% 
backend cycles idle (75.01%) + 75,884,664,773 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 6.659063429 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.242069 sec - 6,461,640,731 cycles # 2.878 GHz - 20,121,052,869 instructions # 3.11 insn per cycle - 2.246196034 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.912450e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.930442e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.930442e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.660516 sec + 5,860,492,825 cycles:u # 3.483 GHz (74.80%) + 714,588 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) + 892,887,399 stalled-cycles-backend:u # 15.24% backend cycles idle (74.84%) + 20,178,833,108 instructions:u # 3.44 insn per cycle + # 0.04 stalled cycles per insn (75.09%) + 1.684440329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.982986 sec - 2,822,415,829 cycles # 2.862 GHz - 6,987,486,660 instructions # 2.48 insn per cycle - 0.987025186 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.387190e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.397750e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.397750e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.693065 sec + 2,473,715,053 cycles:u # 3.459 GHz (74.36%) + 511,829 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.70%) + 267,325,215 stalled-cycles-backend:u # 10.81% backend cycles idle (75.18%) + 7,067,457,267 instructions:u # 2.86 insn per cycle + # 0.04 stalled cycles per insn (75.41%) + 0.716919116 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.855808 sec - 2,484,894,865 cycles # 2.892 GHz - 6,291,816,709 instructions # 2.53 insn per cycle - 0.859867773 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.069890 sec - 2,051,026,977 cycles # 1.912 GHz - 3,263,937,559 instructions # 1.59 insn per cycle - 1.073863100 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 29def3747b..5fc39c9ab7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,227 +1,187 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:22:13 +DATE: 2024-03-03_15:01:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.463988 sec - 2,009,660,419 cycles # 2.987 GHz - 3,043,780,102 instructions # 1.51 insn per cycle - 0.732052318 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.593091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.763833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.764954e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.438026 sec + 1,247,714,977 cycles:u # 2.700 GHz (74.14%) + 3,183,765 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.84%) + 40,804,785 stalled-cycles-backend:u # 3.27% backend cycles idle (75.76%) + 1,566,002,893 instructions:u # 1.26 insn per cycle + # 0.03 stalled cycles per insn (75.65%) + 0.481169073 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.829361 sec - 6,179,090,687 cycles # 3.005 GHz - 13,497,023,724 instructions # 2.18 insn per cycle - 2.119489112 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.300030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.726516e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.726958e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.412260 sec + 11,498,941,344 cycles:u # 3.341 GHz (75.04%) + 39,222,891 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.01%) + 1,145,420,210 stalled-cycles-backend:u # 9.96% backend cycles idle (74.90%) + 9,903,387,147 instructions:u # 0.86 insn per cycle + # 0.12 stalled cycles per insn (74.90%) + 3.462061630 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.072340 sec - 24,646,233,583 cycles # 3.055 GHz - 78,130,465,005 instructions # 3.17 insn per cycle - 8.076398723 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.466591e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.467646e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.467646e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.654970 sec + 23,382,894,327 cycles:u # 3.502 GHz (74.96%) + 1,285,649 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,698,624,585 stalled-cycles-backend:u # 11.54% backend cycles idle (74.96%) + 75,918,146,842 instructions:u # 3.25 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 6.679227111 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.213064 sec - 6,463,144,308 cycles # 2.916 GHz - 20,121,040,605 instructions # 3.11 insn per cycle - 2.217197026 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.948738e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.967099e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.967099e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.654520 sec + 5,849,988,183 cycles:u # 3.489 GHz (74.62%) + 733,925 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.86%) + 893,308,966 stalled-cycles-backend:u # 15.27% backend cycles idle (75.19%) + 20,137,311,738 instructions:u # 3.44 insn per cycle + # 0.04 stalled cycles per insn (75.20%) + 1.678437046 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.977816 sec - 2,816,932,981 cycles # 2.871 GHz - 6,987,870,279 instructions # 2.48 insn per cycle - 0.981891147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.386144e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.396641e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.396641e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.693313 sec + 2,474,987,335 cycles:u # 3.461 GHz (74.36%) + 561,279 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) + 260,961,501 stalled-cycles-backend:u # 10.54% backend cycles idle (75.32%) + 7,061,622,493 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (75.41%) + 0.716991593 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.859893 sec - 2,483,713,955 cycles # 2.877 GHz - 6,295,351,555 instructions # 2.53 insn per cycle - 0.863911879 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.064299 sec - 2,046,605,748 cycles # 1.917 GHz - 3,265,707,472 instructions # 1.60 insn per cycle - 1.068273671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 50b444080d..971e76956f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:34:14 +DATE: 2024-03-03_14:10:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.380502e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.482038 sec - 2,083,496,491 cycles # 2.987 GHz - 3,090,021,729 instructions # 1.48 insn per cycle - 0.780369869 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.566678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.761241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762888e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.424948 sec + 1,151,762,649 cycles:u # 2.598 GHz (74.97%) + 2,181,608 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.88%) + 5,563,008 stalled-cycles-backend:u # 0.48% backend cycles idle (74.76%) + 1,556,455,425 instructions:u # 1.35 insn per cycle + # 0.00 stalled cycles per insn (74.71%) + 0.469666085 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.505248e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.719742 sec - 5,952,430,615 cycles # 3.047 GHz - 11,750,571,480 instructions # 1.97 insn per cycle - 2.009992190 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.731093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.758598e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.759042e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.607659 sec + 8,768,629,457 cycles:u # 3.332 GHz (74.72%) + 2,624,104 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.93%) + 5,576,888 stalled-cycles-backend:u # 0.06% backend cycles idle (74.95%) + 7,487,633,184 instructions:u # 0.85 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 2.658904547 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 
2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.050624 sec - 24,577,706,132 cycles # 3.054 GHz - 77,857,469,800 instructions # 3.17 insn per cycle - 8.057072902 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.471528e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.472574e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.472574e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.641416 sec + 23,353,702,510 
cycles:u # 3.505 GHz (74.93%) + 1,362,469 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 2,617,885,041 stalled-cycles-backend:u # 11.21% backend cycles idle (75.03%) + 75,791,709,024 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 6.666120159 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866268634797E-004 -Relative difference = 5.630135835748959e-08 +Avg ME (F77/C++) = 6.6274866108667618E-004 +Relative difference = 5.871505118544242e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe 
-p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.274363 sec - 6,415,212,085 cycles # 2.816 GHz - 20,086,390,532 instructions # 3.13 insn per cycle - 2.288238797 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.955947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.973665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.973665e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.653154 sec + 5,851,964,104 cycles:u # 3.492 GHz (74.57%) + 716,574 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) + 873,414,641 stalled-cycles-backend:u # 14.93% backend cycles idle (75.18%) + 20,127,366,441 instructions:u # 3.44 insn per cycle + # 0.04 stalled cycles per insn (75.18%) + 1.679093232 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861465384638E-004 -Relative difference = 2.211071647257023e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.010969 sec - 2,918,129,602 cycles # 2.878 GHz - 7,130,827,098 instructions # 2.44 insn per cycle - 1.024648825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.371919e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.382165e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382165e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.697310 sec + 2,486,415,833 cycles:u # 3.455 GHz (74.08%) + 636,088 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.64%) + 320,151,616 stalled-cycles-backend:u # 12.88% backend cycles idle (75.56%) + 7,058,223,186 instructions:u # 2.84 insn per cycle + # 0.05 stalled cycles per insn (75.57%) + 0.723213530 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.895519 sec - 2,583,274,132 cycles # 2.873 GHz - 6,439,451,842 instructions # 2.49 insn per cycle - 0.910176239 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.109477 sec - 2,120,739,457 cycles # 1.905 GHz - 3,428,489,642 instructions # 1.62 insn per cycle - 1.120804955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952032322112E-004 -Relative difference = 3.066639970473621e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 3e610d68fd..9b9d6daf6d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:07:56 +DATE: 2024-03-03_14:37:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599390e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487762 sec - 2,117,397,644 cycles # 2.979 GHz - 3,170,491,357 instructions # 1.50 insn per cycle - 0.771619877 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.588992e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.775838e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.777533e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.424815 sec + 1,147,914,206 cycles:u # 2.596 GHz (75.50%) + 2,137,582 stalled-cycles-frontend:u # 0.19% frontend cycles idle (76.49%) + 4,985,293 stalled-cycles-backend:u # 0.43% backend cycles idle (76.40%) + 1,569,527,290 instructions:u # 1.37 insn per cycle + # 0.00 stalled cycles per insn (75.34%) + 0.469563386 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.728616e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.852993 sec - 6,403,206,858 cycles # 3.066 GHz - 13,984,822,985 instructions # 2.18 insn per cycle - 2.145838793 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.702922e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.730822e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731253e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.628763 sec + 8,748,097,863 cycles:u # 3.316 GHz (74.96%) + 2,437,447 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.13%) + 4,749,082 stalled-cycles-backend:u # 0.05% backend cycles idle (75.19%) + 7,394,823,172 instructions:u # 0.85 insn per cycle + # 0.00 stalled cycles per insn (75.12%) + 2.677900313 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.541681 sec - 87,683,123,741 cycles # 3.072 GHz - 135,626,627,328 instructions # 1.55 insn per cycle - 28.545959109 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.839569e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.840163e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.840163e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 28.090715 sec + 
98,526,254,999 cycles:u # 3.505 GHz (74.98%) + 422,342,769 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.00%) + 5,651,142,068 stalled-cycles-backend:u # 5.74% backend cycles idle (75.01%) + 134,061,336,373 instructions:u # 1.36 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 28.115646344 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340277317796E-004 -Relative difference = 4.184328521943034e-09 +Avg ME (F77/C++) = 6.6275340697351248E-004 +Relative difference = 1.052203199451665e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.302428 sec - 6,776,067,855 cycles # 2.939 GHz - 19,386,467,667 instructions # 2.86 insn per cycle - 2.306810458 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.281572e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.294180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.294180e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.986665 sec + 6,998,627,436 cycles:u # 3.483 GHz (74.92%) + 4,972,896 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) + 3,166,433,191 stalled-cycles-backend:u # 45.24% backend cycles idle (74.92%) + 19,197,411,512 instructions:u # 2.74 insn per cycle + # 0.16 stalled cycles per insn (74.95%) + 2.012844995 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862707273868E-004 -Relative difference = 4.0849182767952624e-08 +Avg ME (F77/C++) = 6.6274857053714997E-004 +Relative difference = 4.445554471174176e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.096393 sec - 3,175,310,502 cycles # 2.890 GHz - 6,807,675,147 instructions # 2.14 insn per cycle - 1.100557110 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.474226e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.478244e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.478244e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.118881 sec + 3,963,587,799 cycles:u # 3.473 GHz (74.82%) + 618,981 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%) + 2,252,906,025 stalled-cycles-backend:u # 56.84% backend cycles idle (74.79%) + 6,771,275,605 instructions:u # 1.71 insn per cycle + # 0.33 stalled cycles per insn (74.83%) + 1.145186082 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735722101156E-004 +Relative difference = 6.454990161554483e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.911313 sec - 2,641,911,907 cycles # 2.888 GHz - 5,985,989,672 instructions # 2.27 insn per cycle - 0.915610697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.084772 sec - 2,074,111,548 cycles # 1.906 GHz - 3,500,542,355 instructions # 1.69 insn per cycle - 1.089027435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750363879224E-004 -Relative difference = 5.490631193034436e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index f668536073..3abe53f650 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:08:48 +DATE: 2024-03-03_14:38:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.593399e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.485011 sec - 2,123,544,393 cycles # 3.007 GHz - 3,219,525,664 instructions # 1.52 insn per cycle - 0.766064420 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.550920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.752610e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.754230e+05 ) sec^-1 +MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 +TOTAL : 0.432609 sec + 1,170,709,207 cycles:u # 2.632 GHz (75.51%) + 2,245,635 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.63%) + 5,034,999 stalled-cycles-backend:u # 0.43% backend cycles idle (75.16%) + 1,568,756,660 instructions:u # 1.34 insn per cycle + # 0.00 stalled cycles per insn (74.90%) + 0.477253690 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.637487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.858325 sec - 6,401,876,626 cycles # 3.056 GHz - 13,834,352,039 instructions # 2.16 insn per cycle - 2.151127842 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.719669e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.747428e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.747865e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 +TOTAL : 2.614994 sec + 8,774,192,257 cycles:u # 3.326 GHz (74.88%) + 2,623,817 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) + 5,376,553 stalled-cycles-backend:u # 0.06% backend cycles idle (75.13%) + 7,407,357,792 instructions:u # 0.84 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 2.665747459 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 
+cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.469746 sec - 87,566,965,728 cycles # 3.076 GHz - 135,909,521,186 instructions # 1.55 insn per cycle - 28.473960910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.805792e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.806376e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.806376e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 28.253870 sec + 
99,085,121,864 cycles:u # 3.504 GHz (74.99%) + 403,753,080 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.99%) + 5,989,522,868 stalled-cycles-backend:u # 6.04% backend cycles idle (74.99%) + 133,996,319,049 instructions:u # 1.35 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 28.278762237 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275352674967369E-004 -Relative difference = 4.0361421941458736e-08 +Avg ME (F77/C++) = 6.6275346486299042E-004 +Relative difference = 5.301670926116898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.304055 sec - 6,854,008,563 cycles # 2.972 GHz - 19,438,508,034 instructions # 2.84 insn per cycle - 2.308246423 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.151371e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.163159e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.163159e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 2.018070 sec + 7,109,141,926 cycles:u # 3.483 GHz (74.92%) + 6,794,771 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.92%) + 2,870,898,971 stalled-cycles-backend:u # 40.38% backend cycles idle (74.92%) + 19,267,656,430 instructions:u # 2.71 insn per cycle + # 0.15 stalled cycles per insn (74.95%) + 2.044423316 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862764021530E-004 -Relative difference = 4.170542995014107e-08 +Avg ME (F77/C++) = 6.6274857044990032E-004 +Relative difference = 4.4587192899226015e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.070827 sec - 3,111,432,280 cycles # 2.896 GHz - 6,718,585,544 instructions # 2.16 insn per cycle - 1.075017514 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.507945e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512087e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512087e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.093716 sec + 3,878,852,536 cycles:u # 3.474 GHz (74.94%) + 558,959 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%) + 2,192,907,076 stalled-cycles-backend:u # 56.53% backend cycles idle (74.92%) + 6,707,121,317 instructions:u # 1.73 insn per cycle + # 0.33 stalled cycles per insn (74.93%) + 1.119771463 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735755491807E-004 +Relative difference = 6.404606472340801e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.900474 sec - 2,630,752,588 cycles # 2.910 GHz - 5,969,340,561 instructions # 2.27 insn per cycle - 0.904647261 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.083027 sec - 2,083,719,160 cycles # 1.918 GHz - 3,494,111,175 instructions # 1.68 insn per cycle - 1.087325959 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750384530066E-004 -Relative difference = 5.80223501432476e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 8553820a52..2ab49e3bd6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:34:44 +DATE: 2024-03-03_14:10:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504525e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522907 sec - 2,248,416,129 cycles # 2.981 GHz - 3,483,881,112 instructions # 1.55 insn per cycle - 0.829467781 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.425592e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.587766e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.588951e+04 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.638584 sec + 1,919,574,096 cycles:u # 2.923 GHz (74.70%) + 2,421,805 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.49%) + 5,597,340 stalled-cycles-backend:u # 0.29% backend cycles idle (74.97%) + 2,103,364,228 instructions:u # 1.10 insn per cycle + # 0.00 stalled cycles per insn (75.54%) + 0.684865412 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.035491 sec - 10,039,386,860 cycles # 3.052 GHz - 22,522,898,713 instructions # 2.24 insn per cycle - 3.349083086 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.244551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.247201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247258e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 7.688882 sec + 26,503,223,516 cycles:u # 3.435 GHz (75.02%) + 3,266,324 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 4,510,741 stalled-cycles-backend:u # 0.02% backend cycles idle (74.96%) + 21,089,006,992 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 7.745515345 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.409354 sec - 25,927,870,734 cycles # 3.082 GHz - 79,436,480,305 instructions # 3.06 insn per cycle - 8.416137774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.210580e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.211471e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211471e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.426674 sec + 26,082,399,852 cycles:u # 
3.501 GHz (74.98%) + 8,305,754 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.98%) + 3,453,424,669 stalled-cycles-backend:u # 13.24% backend cycles idle (74.98%) + 82,445,002,864 instructions:u # 3.16 insn per cycle + # 0.04 stalled cycles per insn (74.98%) + 7.451680513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.395641 sec - 12,641,926,900 cycles # 2.873 GHz - 38,549,360,435 instructions # 3.05 insn per cycle - 4.411574958 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.104934e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.109669e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.109669e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.220409 sec + 11,345,813,930 cycles:u # 3.499 GHz (74.79%) + 3,530,047 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) + 1,380,647,933 stalled-cycles-backend:u # 12.17% backend cycles idle (75.09%) + 38,530,758,808 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.09%) + 3.246786684 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.889905 sec - 5,503,418,397 cycles # 2.905 GHz - 13,481,227,468 instructions # 2.45 insn per cycle - 1.901949052 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.216330e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218977e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218977e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.356050 sec + 4,800,488,094 cycles:u # 3.482 GHz (74.92%) + 704,363 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) + 500,136,585 stalled-cycles-backend:u # 10.42% backend cycles idle (75.05%) + 13,595,981,507 instructions:u # 2.83 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 1.382171821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.679659 sec - 4,858,057,374 cycles # 2.885 GHz - 12,135,455,571 instructions # 2.50 insn per cycle - 1.694768152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.297248 sec - 4,143,595,621 cycles # 1.801 GHz - 6,336,694,490 instructions # 1.53 insn per cycle - 2.312628428 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 44d560fb63..f26a504a43 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:35:21 +DATE: 2024-03-03_14:11:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.505143e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522485 sec - 2,266,664,443 cycles # 3.011 GHz - 3,552,942,464 instructions # 1.57 insn per cycle - 0.824080628 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.391332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447459e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.447953e+05 ) sec^-1 +MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 +TOTAL : 0.516393 sec + 1,446,030,568 cycles:u # 2.710 GHz (76.02%) + 2,133,442 stalled-cycles-frontend:u # 0.15% frontend cycles idle (76.04%) + 5,617,954 stalled-cycles-backend:u # 0.39% backend cycles idle (75.92%) + 1,827,006,654 instructions:u # 1.26 insn per cycle + # 0.00 stalled cycles per insn (74.74%) + 0.563344751 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.023944 sec - 10,029,910,184 cycles # 3.059 GHz - 21,497,951,661 instructions # 2.14 insn per cycle - 3.338904131 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.737172e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.742314e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.742436e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 +TOTAL : 6.349317 sec + 21,791,711,777 cycles:u # 3.417 GHz (74.97%) + 3,055,571 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 4,679,620 stalled-cycles-backend:u # 0.02% backend cycles idle (75.02%) + 17,472,685,439 instructions:u # 0.80 insn per cycle + # 0.00 stalled cycles per insn (75.09%) + 6.404127880 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.531114 sec - 25,939,606,781 cycles # 3.040 GHz - 79,447,311,630 instructions # 3.06 insn per cycle - 8.537643841 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.204504e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.205382e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.205382e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.446882 sec + 26,151,941,870 cycles:u # 
3.502 GHz (74.94%) + 11,506,078 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) + 3,429,320,097 stalled-cycles-backend:u # 13.11% backend cycles idle (74.99%) + 82,356,606,752 instructions:u # 3.15 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 7.471566166 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.372440 sec - 12,693,692,693 cycles # 2.901 GHz - 38,521,475,204 instructions # 3.03 insn per cycle - 4.385193423 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.110923e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.115571e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.115571e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.216374 sec + 11,331,264,346 cycles:u # 3.499 GHz (74.88%) + 3,860,751 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.06%) + 1,226,594,077 stalled-cycles-backend:u # 10.82% backend cycles idle (75.06%) + 38,556,360,425 instructions:u # 3.40 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 3.242722707 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.908191 sec - 5,531,901,200 cycles # 2.893 GHz - 13,605,961,475 instructions # 2.46 insn per cycle - 1.920337987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.221585e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.224270e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.224270e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.350107 sec + 4,758,515,367 cycles:u # 3.466 GHz (74.95%) + 982,235 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 451,795,587 stalled-cycles-backend:u # 9.49% backend cycles idle (74.95%) + 13,618,792,974 instructions:u # 2.86 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 1.376348821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.699452 sec - 4,910,284,170 cycles # 2.883 GHz - 12,271,024,564 instructions # 2.50 insn per cycle - 1.712563313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.177959 sec - 4,164,411,217 cycles # 1.910 GHz - 6,442,301,345 instructions # 1.55 insn per cycle - 2.190574077 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 93119c7539..7372ffa56a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:37:42 +DATE: 2024-03-03_14:13:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065940e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.448496 sec - 8,082,390,398 cycles # 2.946 GHz - 16,852,562,382 instructions # 2.09 insn per cycle - 2.848455369 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.010631e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.015965e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.016022e+01 ) sec^-1 +MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 +TOTAL : 9.444933 sec + 32,646,179,524 cycles:u # 3.456 GHz (74.93%) + 3,576,067 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 6,600,395 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%) + 25,762,478,293 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.495322400 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.002127 sec - 13,348,526,839 cycles # 3.088 GHz - 31,140,905,358 instructions # 2.33 insn per cycle - 4.382097820 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.553385e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557014e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557039e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 +TOTAL : 8.969163 sec + 30,985,581,306 cycles:u # 3.449 GHz (75.00%) + 3,676,317 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 6,175,773 stalled-cycles-backend:u # 0.02% backend cycles idle (74.93%) + 24,530,344,469 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.96%) + 9.014444906 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.566168 sec - 18,831,689,747 cycles # 2.868 GHz - 53,916,332,004 instructions # 2.86 insn per cycle - 6.572689464 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.024281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024308e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024308e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.156094 sec + 18,123,944,756 cycles:u # 3.500 GHz (74.97%) + 25,307,192 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.97%) + 2,201,506,124 stalled-cycles-backend:u # 12.15% backend cycles idle (74.97%) + 55,196,127,858 instructions:u # 3.05 insn per cycle + # 0.04 stalled cycles per insn (74.98%) + 5.180638609 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.182674 sec - 9,806,871,766 cycles # 3.081 GHz - 27,093,022,297 instructions # 2.76 insn per cycle - 3.192772007 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.242509e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.242634e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.242634e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.355912 sec + 8,324,081,906 cycles:u # 3.500 GHz (74.90%) + 1,258,174 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) + 790,392,106 stalled-cycles-backend:u # 9.50% backend cycles idle (75.11%) + 27,094,079,183 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 2.381888397 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.462430 sec - 4,231,767,010 cycles # 2.892 GHz - 9,562,001,834 instructions # 2.26 insn per cycle - 1.472832936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.215964e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216635e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.216635e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.014015 sec + 3,603,710,395 cycles:u # 3.477 GHz (74.59%) + 959,353 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.92%) + 291,639,930 stalled-cycles-backend:u # 8.09% backend cycles idle (75.25%) + 9,572,736,438 instructions:u # 2.66 insn per cycle + # 0.03 stalled cycles per insn (75.31%) + 1.039798359 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.282131 sec - 3,734,243,960 cycles # 2.905 GHz - 8,486,594,514 instructions # 2.27 insn per cycle - 1.294140643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.432645 sec - 2,701,519,987 cycles # 1.882 GHz - 4,274,080,381 instructions # 1.58 insn per cycle - 1.444722496 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 7163808f45..0009f2aa16 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_03:17:34 +DATE: 2024-03-03_14:55:46 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.373786 sec - 8,212,794,649 cycles # 3.050 GHz - 17,373,508,782 instructions # 2.12 insn per cycle - 2.749788140 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.066117e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.066962e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.066962e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.400568 sec + 32,571,712,804 cycles:u # 3.457 GHz (74.98%) + 3,816,546 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 10,306,318 stalled-cycles-backend:u # 0.03% backend cycles idle (74.96%) + 25,702,777,180 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 9.451158645 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.992060 sec - 13,207,906,873 cycles # 3.062 GHz - 30,525,969,027 instructions # 2.31 insn per cycle - 4.371813741 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.555020e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558714e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558714e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 8.996817 sec + 31,094,202,940 cycles:u # 3.448 GHz (74.98%) + 4,612,912 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 60,925,647 stalled-cycles-backend:u # 0.20% backend cycles idle (74.97%) + 24,633,781,700 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.045426867 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.484661 sec - 18,737,465,302 cycles # 2.888 GHz - 53,915,906,594 instructions # 2.88 insn per cycle - 6.488680620 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.018995e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019024e+02 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 1.019024e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.182058 sec + 18,215,591,702 cycles:u # 3.500 GHz (74.94%) + 24,842,932 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.94%) + 2,236,448,537 stalled-cycles-backend:u # 12.28% backend cycles idle (74.94%) + 55,230,783,802 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (74.96%) + 5.207347443 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.177972 sec - 9,794,551,146 cycles # 3.079 GHz - 27,093,049,280 instructions # 2.77 insn per cycle - 3.182112356 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.246982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247111e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247111e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.351645 sec + 8,293,281,219 cycles:u # 3.493 GHz (74.99%) + 576,460 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.07%) + 728,986,715 stalled-cycles-backend:u # 8.79% backend cycles idle (75.07%) + 27,045,320,734 instructions:u # 3.26 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 2.377713290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.495047 sec - 4,300,282,840 cycles # 2.870 GHz - 9,561,701,370 instructions # 2.22 insn per cycle - 1.499121189 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 5.249531e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.250288e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.250288e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.007623 sec + 3,594,206,633 cycles:u # 3.491 GHz (74.73%) + 1,276,276 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) + 288,960,146 stalled-cycles-backend:u # 8.04% backend cycles idle (75.14%) + 9,597,826,038 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (75.15%) + 1.033168851 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.287264 sec - 3,730,461,014 cycles # 2.891 GHz - 8,485,603,542 instructions # 2.27 insn per cycle - 1.291227222 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.415968 sec - 2,690,639,160 cycles # 1.896 GHz - 4,273,336,878 instructions # 1.59 insn per cycle - 1.420067464 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index fcaae9673e..0a6a2db2de 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:38:46 +DATE: 2024-03-03_14:15:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.067339e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.446944 sec - 8,408,759,874 cycles # 3.068 GHz - 18,673,492,162 instructions # 2.22 insn per cycle - 2.843675081 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.061342e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.067173e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.067226e+01 ) sec^-1 +MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 +TOTAL : 9.363420 sec + 32,427,252,088 cycles:u # 3.456 GHz (74.94%) + 3,678,760 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 6,561,717 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%) + 25,590,076,713 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 9.410773685 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.986190 sec - 13,309,313,958 cycles # 3.084 GHz - 29,253,936,467 instructions # 2.20 insn per cycle - 4.370982628 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.558322e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.561699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.561723e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 +TOTAL : 8.970763 sec + 31,031,440,441 cycles:u # 3.452 GHz (75.02%) + 3,594,598 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.06%) + 7,127,107 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) + 24,539,030,272 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 9.017990720 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.219195 sec - 18,809,079,145 cycles # 3.025 GHz - 53,925,834,666 instructions # 2.87 insn per cycle - 6.232860023 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.027524e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.027551e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.027551e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.139911 sec + 18,076,730,884 cycles:u # 3.502 GHz (74.91%) + 27,247,063 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.97%) + 2,156,497,484 stalled-cycles-backend:u # 11.93% backend cycles idle (75.04%) + 55,173,526,576 instructions:u # 3.05 insn per cycle + # 0.04 stalled cycles per insn (75.05%) + 5.164392308 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.189478 sec - 9,805,870,159 cycles # 3.076 GHz - 27,091,831,447 instructions # 2.76 insn per cycle - 3.203897537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.239128e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.239256e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.239256e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.359467 sec + 8,324,328,252 cycles:u # 3.495 GHz (74.84%) + 1,018,855 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%) + 810,197,061 stalled-cycles-backend:u # 9.73% backend cycles idle (74.92%) + 27,094,690,757 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.14%) + 2.385087332 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.464714 sec - 4,224,699,489 cycles # 2.882 GHz - 9,562,401,622 instructions # 2.26 insn per cycle - 1.476328883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.196329e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.196991e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.196991e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.017365 sec + 3,618,316,263 cycles:u # 3.481 GHz (74.61%) + 1,889,874 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.43%) + 305,653,445 stalled-cycles-backend:u # 8.45% backend cycles idle (74.81%) + 9,597,446,711 instructions:u # 2.65 insn per cycle + # 0.03 stalled cycles per insn (75.39%) + 1.043068302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.294499 sec - 3,723,740,700 cycles # 2.874 GHz - 8,486,051,495 instructions # 2.28 insn per cycle - 1.308410916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.421818 sec - 2,699,411,216 cycles # 1.899 GHz - 4,277,531,970 instructions # 1.58 insn per cycle - 1.435104148 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e89ab34326..9f7921db81 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:39:49 +DATE: 2024-03-03_14:16:52 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.769342e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.706494 sec - 5,724,877,835 cycles # 2.946 GHz - 11,350,286,337 instructions # 1.98 insn per cycle - 2.064496697 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.808345e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812077e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.812115e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 +TOTAL : 4.492549 sec + 15,357,555,686 cycles:u # 3.403 GHz (75.00%) + 2,789,283 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) + 6,530,614 stalled-cycles-backend:u # 0.04% backend cycles idle (75.02%) + 12,484,001,278 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.09%) + 4.539017619 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.926202 sec - 6,794,636,243 cycles # 3.076 GHz - 13,931,883,029 instructions # 2.05 insn per cycle - 2.265774235 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.381501e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.397017e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.397171e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 +TOTAL : 4.646461 sec + 15,935,601,520 cycles:u # 3.416 GHz (74.91%) + 2,847,425 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) + 7,051,629 stalled-cycles-backend:u # 0.04% backend cycles idle (74.95%) + 12,917,649,233 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 4.690156490 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common 
-p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.899633 sec - 18,012,008,843 cycles # 3.055 GHz - 53,588,806,253 instructions # 2.98 insn per cycle - 5.906269981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.105909e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.105943e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105943e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.775823 sec + 
16,793,710,535 cycles:u # 3.500 GHz (74.99%) + 14,312,647 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.99%) + 1,786,126,710 stalled-cycles-backend:u # 10.64% backend cycles idle (74.99%) + 51,809,018,697 instructions:u # 3.09 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 4.800324805 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087414119E-003 +Relative difference = 2.1196409216982896e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.492504 sec - 4,596,969,768 cycles # 3.077 GHz - 13,763,413,131 instructions # 2.99 insn per cycle - 1.508036951 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.600191e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.600727e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.600727e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.149306 sec + 4,082,382,206 cycles:u # 3.485 GHz (74.76%) + 1,225,467 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.74%) + 423,325,187 stalled-cycles-backend:u # 10.37% backend cycles idle (74.55%) + 13,835,448,548 instructions:u # 3.39 insn per cycle + # 0.03 stalled cycles per insn (74.90%) + 1.175151715 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.749250 sec - 2,146,538,234 cycles # 2.864 GHz - 4,817,770,938 instructions # 2.24 insn per cycle - 0.763621351 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.041507e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041782e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.041782e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.508518 sec + 1,834,426,422 cycles:u # 3.458 GHz (74.41%) + 689,353 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.37%) + 157,999,464 stalled-cycles-backend:u # 8.61% backend cycles idle (74.48%) + 4,889,172,809 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 0.533897559 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.652928 sec - 1,865,233,671 cycles # 2.849 GHz - 4,274,819,205 instructions # 2.29 insn per cycle - 0.666710238 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.715424 sec - 1,360,172,621 cycles # 1.900 GHz - 2,159,744,323 instructions # 1.59 insn per cycle - 0.729957103 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 684ca24c1f..4777b62c5f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,241 +1,196 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 
1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_03:18:37 +DATE: 2024-03-03_14:57:26 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.598425 sec - 5,724,594,753 cycles # 3.063 GHz - 12,186,790,592 instructions # 2.13 insn per cycle - 1.928350107 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.842786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.843194e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.843194e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 +TOTAL : 4.410058 sec + 15,108,619,628 cycles:u # 3.409 GHz (75.02%) + 2,813,974 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) + 6,668,807 stalled-cycles-backend:u # 0.04% backend cycles idle (74.92%) + 12,277,562,361 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (74.92%) + 4.458755435 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.887489 sec - 6,620,617,732 cycles # 3.045 GHz - 14,303,245,528 instructions # 2.16 insn per cycle - 2.231962749 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.356565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.372551e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.372551e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 +TOTAL : 4.661126 sec + 16,000,894,811 cycles:u # 3.416 GHz (74.90%) + 3,664,056 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) + 46,279,379 stalled-cycles-backend:u # 0.29% backend cycles idle (74.90%) + 12,965,129,778 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 4.705622209 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.812831 sec - 17,931,583,834 cycles # 3.083 GHz - 53,588,775,363 instructions # 2.99 insn per cycle - 5.816760256 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.098781e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098814e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098814e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.806991 sec + 16,893,766,185 cycles:u # 3.498 GHz (74.99%) + 16,568,104 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.99%) + 1,867,774,628 stalled-cycles-backend:u # 11.06% backend cycles idle (74.99%) + 51,769,420,335 instructions:u # 3.06 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 4.831888692 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087414119E-003 +Relative difference = 2.1196409216982896e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.483014 sec - 4,585,157,051 cycles # 3.085 GHz - 13,762,636,955 instructions # 3.00 insn per cycle - 1.487033664 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.623898e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624434e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.624434e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.143685 sec + 4,064,180,023 cycles:u # 3.486 GHz (74.69%) + 655,050 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.48%) + 415,271,617 stalled-cycles-backend:u # 10.22% backend cycles idle (74.82%) + 13,799,508,244 instructions:u # 3.40 insn per cycle + # 0.03 stalled cycles per insn (75.30%) + 1.169308254 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.734407 sec - 2,124,324,714 cycles # 2.880 GHz - 4,817,114,861 instructions # 2.27 insn per cycle - 0.738469635 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.044087e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044350e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.044350e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.507458 sec + 1,819,280,916 cycles:u # 3.436 GHz (74.32%) + 555,295 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.43%) + 147,367,438 stalled-cycles-backend:u # 8.10% backend cycles idle (74.91%) + 4,855,416,131 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (75.56%) + 0.532829777 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.686036 sec - 1,868,608,359 cycles # 2.710 GHz - 4,274,464,507 instructions # 2.29 insn per cycle - 0.690085324 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.701778 sec - 1,356,865,477 cycles # 1.924 GHz - 2,159,196,207 instructions # 1.59 insn per cycle - 0.705773287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 2af18ad9d5..34e9c88382 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:40:36 +DATE: 2024-03-03_14:17:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.766757e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.693781 sec - 5,858,518,501 cycles # 3.029 GHz - 12,487,165,720 instructions # 2.13 insn per cycle - 2.044833380 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.874105e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.877606e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.877630e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 +TOTAL : 4.367192 sec + 14,910,823,313 cycles:u # 3.399 GHz (75.04%) + 2,801,107 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) + 5,541,591 stalled-cycles-backend:u # 0.04% backend cycles idle (74.97%) + 12,185,699,999 instructions:u # 0.82 insn per cycle + # 0.00 stalled cycles per insn (74.91%) + 4.413931935 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.933893 sec - 6,737,061,424 cycles # 3.047 GHz - 14,801,104,127 instructions # 2.20 insn per cycle - 2.267780802 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.363810e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.380339e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.380485e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 +TOTAL : 4.641177 sec + 15,913,862,306 cycles:u # 3.415 GHz (74.94%) + 2,911,751 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.94%) + 5,938,296 stalled-cycles-backend:u # 0.04% backend cycles idle (74.94%) + 12,907,264,996 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 4.684794214 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common 
-p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.925615 sec - 17,989,215,363 cycles # 3.036 GHz - 53,579,777,630 instructions # 2.98 insn per cycle - 5.931642569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.103979e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.104009e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.104009e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.784039 sec + 
16,830,780,157 cycles:u # 3.502 GHz (74.95%) + 16,228,839 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) + 1,752,024,514 stalled-cycles-backend:u # 10.41% backend cycles idle (75.03%) + 51,771,030,949 instructions:u # 3.08 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 4.808440063 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087396841E-003 +Relative difference = 2.119623377106246e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.490731 sec - 4,558,556,123 cycles # 3.055 GHz - 13,757,084,226 instructions # 3.02 insn per cycle - 1.501811120 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.594743e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.595274e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.595274e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.150569 sec + 4,073,477,842 cycles:u # 3.474 GHz (74.76%) + 700,890 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.76%) + 412,825,647 stalled-cycles-backend:u # 10.13% backend cycles idle (74.82%) + 13,787,072,091 instructions:u # 3.38 insn per cycle + # 0.03 stalled cycles per insn (75.16%) + 1.176252412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896225560E-003 -Relative difference = 3.151694379513441e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.743943 sec - 2,139,817,263 cycles # 2.875 GHz - 4,819,936,629 instructions # 2.25 insn per cycle - 0.755587883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.046827e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.047098e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.047098e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.505708 sec + 1,825,688,224 cycles:u # 3.459 GHz (74.31%) + 651,926 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.24%) + 162,825,670 stalled-cycles-backend:u # 8.92% backend cycles idle (74.49%) + 4,891,061,214 instructions:u # 2.68 insn per cycle + # 0.03 stalled cycles per insn (75.26%) + 0.531282824 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.647666 sec - 1,869,906,105 cycles # 2.875 GHz - 4,276,791,956 instructions # 2.29 insn per cycle - 0.664053491 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.718650 sec - 1,366,457,842 cycles # 1.901 GHz - 2,166,062,692 instructions # 1.59 insn per cycle - 0.731356674 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982955140E-003 -Relative difference = 2.0044060904369713e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c639834643..c232f7de62 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done 
for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:41:23 +DATE: 2024-03-03_14:18:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.691928e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.198692 sec - 7,604,134,018 cycles # 3.054 GHz - 16,321,512,266 instructions # 2.15 insn per cycle - 2.594812497 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.742746e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.747735e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.747827e+01 ) sec^-1 +MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 +TOTAL : 9.591909 sec + 33,228,560,743 cycles:u # 3.457 GHz (74.98%) + 3,697,395 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 8,571,213 stalled-cycles-backend:u # 0.03% backend cycles idle (75.01%) + 26,208,740,975 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.639434606 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.397194 sec - 11,475,121,938 cycles # 3.084 GHz - 26,000,925,285 instructions # 2.27 insn per cycle - 3.777191130 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.300447e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.303532e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.303565e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 +TOTAL : 9.331694 sec + 32,295,291,917 cycles:u # 3.453 GHz (75.00%) + 3,442,980 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 6,323,865 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) + 25,512,801,065 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.377328680 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.578920 sec - 19,096,747,933 cycles # 2.903 GHz - 54,154,360,803 instructions # 2.84 insn per cycle - 6.585797711 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.028268e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.028295e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028295e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.136405 sec + 18,072,585,799 cycles:u # 3.504 GHz (74.94%) + 24,128,528 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.01%) + 2,188,182,117 stalled-cycles-backend:u # 12.11% backend cycles idle (75.03%) + 55,394,719,073 instructions:u # 3.07 insn per cycle + # 0.04 stalled cycles per insn (75.03%) + 5.160896024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.239396 sec - 9,369,032,238 cycles # 2.892 GHz - 26,160,172,444 instructions # 2.79 insn per cycle - 3.251135271 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.356983e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.357122e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.357122e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.242013 sec + 7,911,518,876 cycles:u # 3.494 GHz (74.93%) + 1,746,084 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) + 847,402,982 stalled-cycles-backend:u # 10.71% backend cycles idle (74.92%) + 25,904,295,421 instructions:u # 3.27 insn per cycle + # 0.03 stalled cycles per insn (74.92%) + 2.267909527 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.438333 sec - 4,079,178,507 cycles # 2.840 GHz - 9,228,646,226 instructions # 2.26 insn per cycle - 1.450605350 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.482063e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.482807e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.482807e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.964815 sec + 3,422,437,458 cycles:u # 3.468 GHz (74.87%) + 617,421 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) + 268,378,370 stalled-cycles-backend:u # 7.84% backend cycles idle (74.88%) + 9,118,206,141 instructions:u # 2.66 insn per cycle + # 0.03 stalled cycles per insn (74.90%) + 0.990276746 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.218747 sec - 3,509,445,956 cycles # 2.879 GHz - 8,176,263,750 instructions # 2.33 insn per cycle - 1.230057623 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.381042 sec - 2,620,845,167 cycles # 1.898 GHz - 4,155,618,865 instructions # 1.59 insn per cycle - 1.395419124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ace04f97d7..d509869f7f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,224 +1,185 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:42:25 +DATE: 2024-03-03_14:20:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.692361e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171682 sec - 7,616,890,265 cycles # 3.058 GHz - 16,356,089,453 instructions # 2.15 insn per cycle - 2.553555988 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.803581e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.809144e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.809214e+01 ) sec^-1 +MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 +TOTAL : 9.513476 sec + 32,910,544,812 cycles:u # 3.452 GHz (75.02%) + 3,566,297 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) + 6,736,704 stalled-cycles-backend:u # 0.02% backend cycles idle (75.07%) + 26,011,552,309 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.562585571 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.406322 sec - 11,260,210,288 cycles # 3.017 GHz - 25,906,087,343 instructions # 2.30 insn per cycle - 3.788413520 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.335565e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.338667e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.338703e+03 ) sec^-1 +MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 +TOTAL : 9.255352 sec + 32,027,497,769 cycles:u # 3.454 GHz (75.00%) + 3,680,692 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 6,546,956 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) + 25,314,693,014 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 9.300974978 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.644473 sec - 19,262,229,911 cycles # 2.898 GHz - 54,152,472,780 instructions # 2.81 insn per cycle - 6.648593616 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.019957e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019984e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019984e+02 ) sec^-1 
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.178293 sec + 18,209,225,630 cycles:u # 3.502 GHz (74.93%) + 33,008,449 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) + 2,203,899,310 stalled-cycles-backend:u # 12.10% backend cycles idle (74.96%) + 55,456,506,347 instructions:u # 3.05 insn per cycle + # 0.04 stalled cycles per insn (75.03%) + 5.202792696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.257928 sec - 9,349,757,536 cycles # 2.867 GHz - 26,077,919,393 instructions # 2.79 insn per cycle - 3.270643449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.351772e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.351921e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351921e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.246515 sec + 7,913,454,949 cycles:u # 3.488 GHz (74.97%) + 1,078,644 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 769,038,758 stalled-cycles-backend:u # 9.72% backend cycles idle (74.97%) + 25,807,143,781 instructions:u # 3.26 insn per cycle + # 0.03 stalled cycles per insn (74.97%) + 2.272377888 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.408906 sec - 4,059,558,991 cycles # 2.874 GHz - 9,213,876,384 instructions # 2.27 insn per cycle - 1.420092908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.540730e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.541492e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.541492e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.954631 sec + 3,385,956,590 cycles:u # 3.466 GHz (74.62%) + 666,916 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.67%) + 307,339,922 stalled-cycles-backend:u # 9.08% backend cycles idle (75.08%) + 9,093,841,869 instructions:u # 2.69 insn per cycle + # 0.03 stalled cycles per insn (75.43%) + 0.980198317 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.231479 sec - 3,558,951,872 cycles # 2.881 GHz - 8,168,148,330 instructions # 2.30 insn per cycle - 1.241837128 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.381601 sec - 2,619,896,392 cycles # 1.892 GHz - 4,153,497,129 instructions # 1.59 insn per cycle - 1.390536918 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 4f705cbffa..be5aca464a 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:35:57 +DATE: 2024-03-03_14:12:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446213 sec - 1,972,017,701 cycles # 2.992 GHz - 2,778,256,208 instructions # 1.41 insn per cycle - 0.734930275 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 695,623,402 cycles:u # 2.059 GHz (77.12%) + 2,296,681 stalled-cycles-frontend:u # 0.33% frontend cycles idle (73.00%) + 5,536,743 stalled-cycles-backend:u # 0.80% backend cycles idle (69.67%) + 1,262,080,978 instructions:u # 1.81 insn per cycle + # 0.00 stalled cycles per insn (73.44%) + 0.384767862 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.528224 sec - 2,304,762,750 cycles # 3.008 GHz - 3,294,040,641 instructions # 1.43 insn per cycle - 0.823439197 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 916,816,301 cycles:u # 2.085 GHz (74.02%) + 2,147,831 stalled-cycles-frontend:u # 0.23% frontend cycles idle (72.75%) + 5,227,451 stalled-cycles-backend:u # 0.57% backend cycles idle (74.90%) + 1,320,642,522 instructions:u # 1.44 insn per cycle + # 0.00 stalled cycles per insn (77.11%) + 0.466448613 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6911ee0) on address 0x14da00f7a000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14dc962e3dbf in ??? +#1 0x14dc962e3d2b in ??? +#2 0x14dc962e53e4 in ??? +#3 0x14dc8e7b6b64 in ??? +#4 0x14dc8e7b3b38 in ??? +#5 0x14dc8e771496 in ??? +#6 0x14dc9627d6e9 in ??? +#7 0x14dc963b149e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.522856 sec - 4,703,604,569 cycles # 3.081 GHz - 13,462,460,024 instructions # 2.86 insn per cycle - 1.529442917 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.861454 sec - 2,622,516,081 cycles # 3.029 GHz - 7,553,226,055 instructions # 2.88 insn per cycle - 0.875162721 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.506903 sec - 1,479,878,074 cycles # 2.896 GHz - 3,120,545,502 instructions # 2.11 insn per cycle - 0.521612120 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456990 sec - 1,342,026,946 cycles # 2.909 GHz - 2,982,806,139 instructions # 2.22 insn per cycle - 0.473253864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.178542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.198431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.198431e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.413844 sec + 4,996,522,481 cycles:u # 3.478 GHz (74.95%) + 2,383,415 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.95%) + 850,537,335 stalled-cycles-backend:u # 17.02% backend cycles idle (74.95%) + 13,818,020,085 instructions:u # 2.77 insn per cycle + # 0.06 stalled cycles per insn (74.96%) + 1.439637830 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.665523 sec - 1,326,336,546 cycles # 1.981 GHz - 1,954,248,677 instructions # 1.47 insn per cycle - 0.676015017 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x1511a565a000. Reason: Unknown. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 7838899130..9949de34d4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,241 +1,119 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_03:15:54 +DATE: 2024-03-03_14:54:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.471075 sec - 2,051,009,542 cycles # 3.009 GHz - 3,055,349,974 instructions # 1.49 insn per cycle - 0.738770181 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 769,805,366 cycles:u # 2.194 GHz (75.09%) + 3,066,140 stalled-cycles-frontend:u # 0.40% frontend cycles idle (75.19%) + 22,207,789 stalled-cycles-backend:u # 2.88% backend cycles idle (75.94%) + 1,218,976,716 instructions:u # 1.58 insn per cycle + # 0.02 stalled cycles per insn (76.01%) + 0.377671175 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.748132 sec - 3,046,262,026 cycles # 3.023 GHz - 4,636,082,832 instructions # 1.52 insn per cycle - 1.065675268 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 3,001,509,066 cycles:u # 2.758 GHz (75.33%) + 30,405,025 stalled-cycles-frontend:u # 1.01% frontend cycles idle (75.18%) + 855,704,628 stalled-cycles-backend:u # 28.51% backend cycles idle (74.99%) + 3,253,779,151 instructions:u # 1.08 insn per cycle + # 0.26 stalled cycles per insn (74.65%) + 1.115519804 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent 
handle: 0x6911ee0) on address 0x14f2f231a000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14f587689dbf in ??? +#1 0x14f587689d2b in ??? +#2 0x14f58768b3e4 in ??? +#3 0x14f57fb5cb64 in ??? +#4 0x14f57fb59b38 in ??? +#5 0x14f57fb17496 in ??? +#6 0x14f5876236e9 in ??? +#7 0x14f58775749e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.529900 sec - 4,728,814,715 cycles # 3.083 GHz - 13,467,526,764 instructions # 2.85 insn per cycle - 1.534252544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.180080e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200126e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.416017 sec + 4,992,511,017 cycles:u # 3.469 GHz (74.99%) + 1,981,947 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) + 839,238,185 stalled-cycles-backend:u # 16.81% backend cycles idle (74.99%) + 13,804,114,722 instructions:u # 2.76 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 1.441472413 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.869004 sec - 2,652,875,861 cycles # 3.039 GHz - 7,602,145,003 instructions # 2.87 insn per cycle - 0.873736497 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.550316 sec - 1,514,222,662 cycles # 2.732 GHz - 3,170,467,422 instructions # 2.09 insn per cycle - 0.554802806 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.478096 sec - 1,374,122,120 cycles # 2.850 GHz - 3,032,631,270 instructions # 2.21 insn per cycle - 0.482825268 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.675099 sec - 1,354,490,621 cycles # 1.996 GHz - 1,991,409,834 instructions # 1.47 insn per cycle - 0.679620955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x1478f912a000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 1de3a7df55..8450f3c38f 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:15 +DATE: 2024-03-03_14:12:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.443315 sec - 2,012,981,464 cycles # 3.013 GHz - 2,802,025,362 instructions # 1.39 insn per cycle - 0.744859677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 726,970,759 cycles:u # 2.147 GHz (75.40%) + 2,277,890 stalled-cycles-frontend:u # 0.31% frontend cycles idle (74.87%) + 4,777,867 stalled-cycles-backend:u # 0.66% backend cycles idle (73.78%) + 1,295,834,284 instructions:u # 1.78 insn per cycle + # 0.00 stalled cycles per insn (70.60%) + 0.369418155 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.526694 sec - 2,300,725,267 cycles # 3.007 GHz - 3,244,738,845 instructions # 1.41 insn per cycle - 0.822736768 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 879,697,664 cycles:u # 2.004 GHz (77.05%) + 2,139,401 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.33%) + 5,059,191 stalled-cycles-backend:u # 0.58% backend cycles idle (76.36%) + 1,322,149,268 instructions:u # 1.50 insn per cycle + # 0.00 stalled cycles per insn (75.51%) + 0.482825554 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6911e30) on address 0x154da8629000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x15503d999dbf in ??? +#1 0x15503d999d2b in ??? +#2 0x15503d99b3e4 in ??? +#3 0x155035e6cb64 in ??? +#4 0x155035e69b38 in ??? +#5 0x155035e27496 in ??? +#6 0x15503d9336e9 in ??? +#7 0x15503da6749e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.520645 sec - 4,710,102,553 cycles # 3.090 GHz - 13,456,334,828 instructions # 2.86 insn per cycle - 1.527404362 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.841713 sec - 2,618,818,041 cycles # 3.096 GHz - 7,552,217,415 instructions # 2.88 insn per cycle - 0.854217946 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.506766 sec - 1,482,977,233 cycles # 2.909 GHz - 3,119,381,568 instructions # 2.10 insn per cycle - 0.519705447 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.457488 sec - 1,337,095,985 cycles # 2.896 GHz - 2,979,946,273 instructions # 2.23 insn per cycle - 0.473330982 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.182610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202714e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.202714e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.408665 sec + 4,975,987,017 cycles:u # 3.477 GHz (74.85%) + 1,841,925 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.85%) + 659,857,967 stalled-cycles-backend:u # 13.26% backend cycles idle (74.85%) + 13,849,894,286 instructions:u # 2.78 insn per cycle + # 0.05 stalled cycles per insn (74.90%) + 1.433203022 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.666550 sec - 1,326,556,264 cycles # 1.978 GHz - 1,952,513,162 instructions # 1.47 insn per cycle - 0.681133765 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x66d340) on address 0x14a1af479000. Reason: Unknown. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 4d40239a82..9d0c590c91 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:34 +DATE: 2024-03-03_14:12:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439896 sec - 1,919,384,660 cycles # 2.928 GHz - 2,652,462,812 instructions # 1.38 insn per cycle - 0.728915663 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 718,932,282 cycles:u # 2.158 GHz (73.06%) + 2,292,913 stalled-cycles-frontend:u # 0.32% frontend cycles idle (71.35%) + 5,485,037 stalled-cycles-backend:u # 0.76% backend cycles idle (73.12%) + 1,229,882,023 instructions:u # 1.71 insn per cycle + # 0.00 stalled cycles per insn (75.81%) + 0.361196020 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476459 sec - 2,111,535,021 cycles # 3.010 GHz - 2,984,192,787 instructions # 1.41 insn per cycle - 0.759063881 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 778,058,565 cycles:u # 2.038 GHz (75.01%) + 2,296,658 stalled-cycles-frontend:u # 0.30% frontend cycles idle (72.63%) + 5,017,192 stalled-cycles-backend:u # 0.64% backend cycles idle (74.44%) + 1,258,146,385 instructions:u # 1.62 insn per cycle + # 0.00 stalled cycles per insn (76.74%) + 0.409848855 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x690ff00) on address 0x1492f1885000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x149586befdbf in ??? +#1 0x149586befd2b in ??? +#2 0x149586bf13e4 in ??? +#3 0x14957f0c2b64 in ??? +#4 0x14957f0bfb38 in ??? +#5 0x14957f07d496 in ??? +#6 0x149586b896e9 in ??? +#7 0x149586cbd49e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.434431 sec - 4,452,862,887 cycles # 3.097 GHz - 13,047,773,125 instructions # 2.93 insn per cycle - 1.440725517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.547840 sec - 1,698,684,785 cycles # 3.077 GHz - 4,513,142,797 instructions # 2.66 insn per cycle - 0.560862800 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.289099 sec - 853,788,001 cycles # 2.912 GHz - 1,897,231,072 instructions # 2.22 insn per cycle - 0.300313484 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.271830 sec - 801,479,133 cycles # 2.904 GHz - 1,820,357,988 instructions # 2.27 insn per cycle - 0.285846070 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.430784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.461040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.461040e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.166312 sec + 4,125,984,555 cycles:u # 3.473 GHz (75.09%) + 2,301,196 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.11%) + 315,086,834 stalled-cycles-backend:u # 7.64% backend cycles idle (75.11%) + 12,630,321,325 instructions:u # 3.06 insn per cycle + # 0.02 stalled cycles per insn (75.10%) + 1.190560148 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.349567 sec - 731,841,700 cycles # 2.069 GHz - 1,305,336,291 instructions # 1.78 insn per cycle - 0.359850888 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6242c0) on address 0x14ed3f78d000. Reason: Unknown. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 441da29ffb..3b6b47fdec 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,241 +1,119 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_03:16:12 +DATE: 2024-03-03_14:54:28 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.448633 sec - 2,014,530,108 cycles # 3.024 GHz - 2,953,646,670 instructions # 1.47 insn per cycle - 0.724573840 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 728,018,820 cycles:u # 2.102 GHz (74.62%) + 2,828,733 stalled-cycles-frontend:u # 0.39% frontend cycles idle (76.61%) + 37,815,477 stalled-cycles-backend:u # 5.19% backend cycles idle (77.98%) + 1,243,724,290 instructions:u # 1.71 insn per cycle + # 0.03 stalled cycles per insn (75.72%) + 0.373758611 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.616658 sec - 2,563,348,424 cycles # 3.027 GHz - 3,871,269,369 instructions # 1.51 insn per cycle - 0.904047137 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 2,913,829,385 cycles:u # 2.886 GHz (73.84%) + 30,498,351 stalled-cycles-frontend:u # 1.05% frontend cycles idle (74.73%) + 847,633,350 stalled-cycles-backend:u # 29.09% backend cycles idle (75.76%) + 3,091,197,240 instructions:u # 1.06 insn per cycle + # 0.27 stalled cycles per insn (75.41%) + 1.034770890 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent 
handle: 0x690ff00) on address 0x14d87d5ed000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14db12952dbf in ??? +#1 0x14db12952d2b in ??? +#2 0x14db129543e4 in ??? +#3 0x14db0ae25b64 in ??? +#4 0x14db0ae22b38 in ??? +#5 0x14db0ade0496 in ??? +#6 0x14db128ec6e9 in ??? +#7 0x14db12a2049e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.433803 sec - 4,469,694,345 cycles # 3.110 GHz - 13,052,094,019 instructions # 2.92 insn per cycle - 1.437926738 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.433588e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.463885e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.463885e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.166373 sec + 4,145,866,751 cycles:u # 3.486 GHz (74.65%) + 2,407,100 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) + 316,076,995 stalled-cycles-backend:u # 7.62% backend cycles idle (75.12%) + 12,640,812,453 instructions:u # 3.05 insn per cycle + # 0.03 stalled cycles per insn (75.12%) + 1.191670420 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.554057 sec - 1,716,801,013 cycles # 3.079 GHz - 4,560,314,564 instructions # 2.66 insn per cycle - 0.558193661 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.297621 sec - 872,015,724 cycles # 2.894 GHz - 1,933,356,220 instructions # 2.22 insn per cycle - 0.301984624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.276934 sec - 818,470,682 cycles # 2.917 GHz - 1,856,220,484 instructions # 2.27 insn per cycle - 0.281151541 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.358667 sec - 751,185,964 cycles # 2.073 GHz - 1,346,032,296 instructions # 1.79 insn per cycle - 0.362975431 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6242c0) on address 0x154ddd27d000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8918bec5c8..9f2052d970 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:50 +DATE: 2024-03-03_14:12:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.436130 sec - 1,959,442,257 cycles # 3.009 GHz - 2,743,667,126 instructions # 1.40 insn per cycle - 0.720037686 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault + 718,430,242 cycles:u # 2.167 GHz (75.24%) + 2,158,370 stalled-cycles-frontend:u # 0.30% frontend cycles idle (76.38%) + 5,048,832 stalled-cycles-backend:u # 0.70% backend cycles idle (74.59%) + 1,291,858,340 instructions:u # 1.80 insn per cycle + # 0.00 stalled cycles per insn (71.10%) + 0.365143940 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476114 sec - 2,116,952,174 cycles # 3.025 GHz - 3,000,364,507 instructions # 1.42 insn per cycle - 0.758577490 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault + 806,118,878 cycles:u # 2.070 GHz (75.37%) + 2,154,811 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.52%) + 4,443,268 stalled-cycles-backend:u # 0.55% backend cycles idle (75.93%) + 1,240,070,438 instructions:u # 1.54 insn per cycle + # 0.00 stalled cycles per insn (76.95%) + 0.418220120 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x690fe50) on address 0x150d87cec000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x15101d054dbf in ??? +#1 0x15101d054d2b in ??? +#2 0x15101d0563e4 in ??? +#3 0x151015527b64 in ??? +#4 0x151015524b38 in ??? +#5 0x1510154e2496 in ??? +#6 0x15101cfee6e9 in ??? +#7 0x15101d12249e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.438010 sec - 4,446,707,539 cycles # 3.084 GHz - 13,028,651,848 instructions # 2.93 insn per cycle - 1.444314220 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.547784 sec - 1,696,823,876 cycles # 3.074 GHz - 4,509,092,353 instructions # 2.66 insn per cycle - 0.559046282 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.292180 sec - 859,590,330 cycles # 2.901 GHz - 1,893,994,453 instructions # 2.20 insn per cycle - 0.304986924 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.269638 sec - 798,515,936 cycles # 2.915 GHz - 1,816,168,831 instructions # 2.27 insn per cycle - 0.281600896 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.421570e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.451417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.451417e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.173486 sec + 4,157,347,370 cycles:u # 3.477 GHz (74.63%) + 2,500,400 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.82%) + 625,232,992 stalled-cycles-backend:u # 15.04% backend cycles idle (75.10%) + 12,626,682,513 instructions:u # 3.04 insn per cycle + # 0.05 stalled cycles per insn (75.25%) + 1.197800077 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.355005 sec - 734,840,966 cycles # 2.046 GHz - 1,303,017,912 instructions # 1.77 insn per cycle - 0.365594980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x621ef0) on address 0x14eb4effc000. Reason: Unknown. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 9473075c44..6681303993 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:37:07 +DATE: 2024-03-03_14:13:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444064 sec - 2,011,501,510 cycles # 2.996 GHz - 2,813,725,950 instructions # 1.40 insn per cycle - 0.745188123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault + 680,746,734 cycles:u # 1.998 GHz (74.40%) + 2,192,175 stalled-cycles-frontend:u # 0.32% frontend cycles idle (70.65%) + 5,227,388 stalled-cycles-backend:u # 0.77% backend cycles idle (73.67%) + 1,163,912,158 instructions:u # 1.71 insn per cycle + # 0.00 stalled cycles per insn (77.36%) + 0.370007528 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.531362 sec - 2,289,898,203 cycles # 2.976 GHz - 3,193,334,828 instructions # 1.39 insn per cycle - 0.827090728 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault + 932,712,212 cycles:u # 2.128 GHz (72.21%) + 2,151,709 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.43%) + 4,978,401 stalled-cycles-backend:u # 0.53% backend cycles idle (77.14%) + 1,373,314,207 instructions:u # 1.47 insn per cycle + # 0.00 stalled cycles per insn (76.46%) + 0.466969143 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6911ee0) on address 0x14cc9bf9a000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14cf312fedbf in ??? +#1 0x14cf312fed2b in ??? +#2 0x14cf313003e4 in ??? +#3 0x14cf297d1b64 in ??? +#4 0x14cf297ceb38 in ??? +#5 0x14cf2978c496 in ??? +#6 0x14cf312986e9 in ??? +#7 0x14cf313cc49e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.528426 sec - 4,733,772,591 cycles # 3.090 GHz - 13,465,129,433 instructions # 2.84 insn per cycle - 1.534888113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.843067 sec - 2,603,799,246 cycles # 3.073 GHz - 7,385,481,301 instructions # 2.84 insn per cycle - 0.853727039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.502006 sec - 1,465,753,503 cycles # 2.896 GHz - 3,056,435,528 instructions # 2.09 insn per cycle - 0.511483566 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444397 sec - 1,302,869,174 cycles # 2.905 GHz - 2,931,108,724 instructions # 2.25 insn per cycle - 0.456529729 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.180041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.200018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.200018e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.411883 sec + 4,987,990,687 cycles:u # 3.478 GHz (74.90%) + 1,727,890 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) + 898,353,870 stalled-cycles-backend:u # 18.01% backend cycles idle (74.90%) + 13,843,424,201 instructions:u # 2.78 insn per cycle + # 0.06 stalled cycles per insn (74.91%) + 1.436362073 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.681918 sec - 1,362,782,748 cycles # 1.986 GHz - 1,970,355,079 instructions # 1.45 insn per cycle - 0.693685126 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x14f37779a000. Reason: Unknown. 
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f04f8628ac..f0f62cc1da 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,224 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand -HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasNoCurand +HASHIPRAND=hasHiprand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:37:24 +DATE: 2024-03-03_14:13:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445224 sec - 1,992,469,002 cycles # 2.992 GHz - 2,813,148,728 instructions # 1.41 insn per cycle - 0.736789901 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault + 705,255,530 cycles:u # 2.120 GHz (73.74%) + 2,090,693 stalled-cycles-frontend:u # 0.30% frontend cycles idle (76.82%) + 4,796,050 stalled-cycles-backend:u # 0.68% backend cycles idle (75.80%) + 1,191,491,523 instructions:u # 1.69 insn per cycle + # 0.00 stalled cycles per insn (76.03%) + 0.361944732 seconds time elapsed ......................................................................... 
-runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.532147 sec - 2,297,521,664 cycles # 2.990 GHz - 3,210,517,070 instructions # 1.40 insn per cycle - 0.827894226 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault + 898,559,935 cycles:u # 2.067 GHz (74.78%) + 2,139,128 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.78%) + 5,276,888 stalled-cycles-backend:u # 0.59% backend cycles idle (76.21%) + 1,369,752,031 instructions:u # 1.52 insn per cycle + # 0.00 stalled cycles per insn (76.12%) + 0.464817968 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6911e30) on address 0x14c05cea9000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14c2f221adbf in ??? +#1 0x14c2f221ad2b in ??? +#2 0x14c2f221c3e4 in ??? +#3 0x14c2ea6edb64 in ??? +#4 0x14c2ea6eab38 in ??? +#5 0x14c2ea6a8496 in ??? +#6 0x14c2f21b46e9 in ??? +#7 0x14c2f22e849e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.523445 sec - 4,724,741,346 cycles # 3.094 GHz - 13,451,257,746 instructions # 2.85 insn per cycle - 1.529633779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.835617 sec - 2,595,186,002 cycles # 3.089 GHz - 7,389,201,553 instructions # 2.85 insn per cycle - 0.854907608 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.503119 sec - 1,466,604,979 cycles # 2.890 GHz - 3,056,260,975 instructions # 2.08 insn per cycle - 0.515296062 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.457389 sec - 1,310,592,019 cycles # 2.838 GHz - 2,931,897,706 instructions # 2.24 insn per cycle - 0.469608344 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+EvtsPerSec[Rmb+ME] (23) = ( 1.167118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.186677e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186677e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.427028 sec + 5,051,858,997 cycles:u # 3.486 GHz (74.65%) + 1,906,701 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.92%) + 961,495,185 stalled-cycles-backend:u # 19.03% backend cycles idle (75.16%) + 13,852,010,764 instructions:u # 2.74 insn per cycle + # 0.07 stalled cycles per insn (75.17%) + 1.451501973 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.689340 sec - 1,364,202,689 cycles # 1.967 GHz - 1,970,285,028 instructions # 1.44 insn per cycle - 0.699058633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x66d340) on address 0x15362ed29000. Reason: Unknown. 
From a77fb343aa063d758c0a3c585741977fbd74a33b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 19:45:44 +0200 Subject: [PATCH 88/96] [susy2] rerun 18 tmad tests on LUMI, all ok (execpt for gqttq as usual) (1) all tests but ggttggg STARTED AT Sun 03 Mar 2024 04:32:31 PM EET ENDED AT Sun 03 Mar 2024 05:04:29 PM EET (2) ggttggg tests only STARTED AT Sun 03 Mar 2024 04:53:00 PM EET ENDED AT Sun 03 Mar 2024 07:28:33 PM EET Status=0 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 16 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 412 +++++----------- .../log_eemumu_mad_f_inl0_hrd0.txt | 424 ++++++----------- .../log_eemumu_mad_m_inl0_hrd0.txt | 402 +++++----------- .../log_ggtt_mad_d_inl0_hrd0.txt | 412 +++++----------- .../log_ggtt_mad_f_inl0_hrd0.txt | 420 ++++++---------- .../log_ggtt_mad_m_inl0_hrd0.txt | 404 +++++----------- .../log_ggttg_mad_d_inl0_hrd0.txt | 430 ++++++----------- .../log_ggttg_mad_f_inl0_hrd0.txt | 432 ++++++----------- .../log_ggttg_mad_m_inl0_hrd0.txt | 428 ++++++----------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 432 ++++++----------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 432 ++++++----------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 432 ++++++----------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 426 ++++++----------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 420 ++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 420 ++++++---------- .../log_gqttq_mad_d_inl0_hrd0.txt | 445 +++++------------ .../log_gqttq_mad_f_inl0_hrd0.txt | 449 +++++------------- .../log_gqttq_mad_m_inl0_hrd0.txt | 447 +++++------------ 18 files changed, 2377 insertions(+), 5290 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index fb2022a061..7e993f4ca8 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum 
+Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:35:28 +DATE: 2024-03-03_16:48:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6832s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5695s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5635s + [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1761s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s - [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2819s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s + [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1629s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2919s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2266s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0653s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.428280e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.439855e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1874s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1419s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1384s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.32E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661518E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3298s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2644s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2256s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.392008e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.453554e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1738s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1398s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1374s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.31E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2227s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 90112 events => throughput is 3.31E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.423627e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.552803e+06 ) sec^-1 -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4804s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events 
=> throughput is 1.80E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,18 +390,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7068s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5071s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5028s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.07E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +412,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.186318e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.600394e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.242769e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.894046e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.247301e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.953286e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.191475e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.549947e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 
130936da07..302bb64830 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:35:44 +DATE: 2024-03-03_16:49:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7004s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5030s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1752s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1370s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1311s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3760s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s - [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2181s + [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382701684199335E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1795s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1451s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1398s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.54E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382703205998396E-002) differ by less than 4E-4 (1.306308462512007e-07) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382701684199335E-002) differ by less than 4E-4 (1.4692721372888684e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515588842633111E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3578s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2812s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2244s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 90112 events => throughput is 1.59E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002) differ by less than 4E-4 (1.2999352305698153e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515588842633111E-002) differ by less than 4E-4 (1.439903947186849e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.658570e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.672811e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382719831741665E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1759s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1382s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1361s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.84E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700723828302E-002) differ by less than 4E-4 (1.5721146218172777e-07) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719831741665E-002) differ by less than 4E-4 (4.740791825774693e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515606481761602E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2446s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 90112 events => throughput is 3.84E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002) differ by less than 4E-4 (1.5742791048545257e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606481761602E-002) differ by less than 4E-4 (4.875410031246474e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.091998e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.235774e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382719700521907E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1759s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1361s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719700521907E-002) differ by less than 4E-4 (4.6002735842876064e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515606480805645E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2404s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2208s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0196s for 90112 events => throughput is 4.59E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606480805645E-002) differ by less than 4E-4 (4.874365444607065e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1764s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3134s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.829243e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.004255e+06 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704335459282E-002) differ by less than 4E-4 (1.1853587900123586e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002) differ by less than 4E-4 (1.1717945325173673e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382704338101225E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4176s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.62E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382704338101225E-002) differ by less than 4E-4 (1.1850758729892164e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515591361999701E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5010s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4980s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.02E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591361999701E-002) differ by less than 4E-4 (1.1646102771045719e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.783310e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.366509e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.344857e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.689472e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.322265e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.874276e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.118728e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.930551e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 
da7367ae5e..ef30c48422 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' 
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:01 +DATE: 2024-03-03_16:49:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1780s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1371s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1312s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3702s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s - [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2180s + [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1492s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1432s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701395E-002) differ by less than 2E-4 (1.7176482458580722e-10) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3592s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2259s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.410977e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.429208e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1413s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1379s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2603s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2229s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 90112 events => throughput is 2.41E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.470891e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.528271e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1390s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1364s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484913930753692e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3229s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2498s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0283s for 90112 events => throughput is 3.18E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.332736e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1830s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.448928e+06 ) sec^-1 -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715392009222E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5902s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4147s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.04E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) +OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715392009222E-002) differ by less than 2E-4 (1.3548862032308762e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,18 +390,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5033s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4990s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.08E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +412,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.194063e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.598738e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.245461e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.916171e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.280594e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.964470e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.206983e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.552281e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 
657075d34f..46de15fd70 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:18 +DATE: 2024-03-03_16:49:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8052s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s - [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7329s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7043s + [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 
2.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3849s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s - [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3021s + [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6297s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s - [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3429s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0301s + [COUNTERS] Fortran MEs ( 1 ) : 0.3128s for 90112 events => throughput is 2.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4282s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3727s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.53E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6901s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4231s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0658s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3573s for 90112 events => throughput is 2.52E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.598891e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.609518e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756619] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3927s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756619) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989085] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4997s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2463s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1954s for 90112 events => throughput is 4.61E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989085) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.739258e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.744593e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3155s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.94E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4003s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1533s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0403s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.98E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989114) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.234509e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.326375e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3737s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3804s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,8 +357,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -514,13 +366,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5794s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,8 +390,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -547,9 +399,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6782s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3240s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3165s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.21E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +412,43 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.429406e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.049159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.777279e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.754790e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.779709e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.941572e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.764332e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.142528e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eb011c6697..243b746871 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:45 +DATE: 2024-03-03_16:50:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5896s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5611s + [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3888s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3375s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s + [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6449s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s - [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3431s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0309s + [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094178241446492] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3491s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3217s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 8192 events => throughput is 2.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094179780921394) differ by less than 4E-4 (1.0665510541407741e-07) +OK! xsec from fortran (47.094184803756626) and cpp (47.094178241446492) differ by less than 4E-4 (1.3934438314322506e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105686930681671] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6592s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3609s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3017s for 90112 events => throughput is 2.99E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ by less than 4E-4 (1.4224799227413598e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105686930681671) differ by less than 4E-4 (1.7724624157278157e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.115125e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.119656e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094176373190514] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3069s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094175850060040) differ by less than 4E-4 (1.9012318908107062e-07) +OK! xsec from fortran (47.094184803756626) and cpp (47.094176373190514) differ by less than 4E-4 (1.7901501314643298e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105685173093654] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4203s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1832s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0422s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1410s for 90112 events => throughput is 6.39E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ by less than 4E-4 (2.2324275217311396e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105685173093654) differ by less than 4E-4 (2.1455782361901043e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.830369e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.554914e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094174474272364] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3679s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3070s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) +OK! xsec from fortran (47.094184803756626) and cpp (47.094174474272364) differ by less than 4E-4 (2.1933672500473733e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684585116684] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3368s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.1062s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0366s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0696s for 90112 events => throughput is 1.29E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684585116684) differ by less than 4E-4 (2.2703990176786704e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3377s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.369839e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094178213275804) differ by less than 4E-4 (1.3994256109484127e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.384682e+06 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3760s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ by less than 4E-4 (1.4588574703822133e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094176770070867] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7795s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5810s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5807s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.42E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) +OK! xsec from fortran (47.094184803756626) and cpp (47.094176770070867) differ by less than 4E-4 (1.705876382374072e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105687115703695] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7005s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3190s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3154s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 90112 events => throughput is 2.50E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105687115703695) differ by less than 4E-4 (1.733184357144424e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.036278e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.234773e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.078030e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.031314e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.091165e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.108729e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.168153e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.508893e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index bef66309f6..d4f3b3626b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 
+++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be 
done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:37:12 +DATE: 2024-03-03_16:50:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s - [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5631s + [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 
2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3956s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3206s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6496s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s - [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3443s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0322s + [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3604s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3274s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) +OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428942997143e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4306s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0684s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541809e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.553175e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863908] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3293s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3114s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) +OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863908) differ by less than 2E-4 (2.8413429165041748e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5038s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2460s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0575s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1884s for 90112 events => throughput is 4.78E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.839969e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.879227e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186193208834] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3018s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) +OK! xsec from fortran (47.094184803756626) and cpp (47.094186193208834) differ by less than 2E-4 (2.9503689491505725e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696667630852] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4021s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1520s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0417s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1103s for 90112 events => throughput is 8.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696667630852) differ by less than 2E-4 (2.9458046002517335e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.436276e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.512312e+05 ) sec^-1 -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3844s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3910s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4767s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184798437837] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5801s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.13E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) +OK! xsec from fortran (47.094184803756626) and cpp (47.094184798437837) differ by less than 2E-4 (1.1293943558143837e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,8 +390,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -547,9 +399,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6837s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6773s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3264s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3189s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -560,43 +412,43 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.598819e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.054804e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.803359e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.800501e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.773858e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.012601e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.783351e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.162857e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index cd3823dd45..1095013ce9 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:37:39 +DATE: 2024-03-03_16:51:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s - [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3630s + [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => 
throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s - [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4596s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s + [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.04E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.9846s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s - [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 3.4323s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2043s + [COUNTERS] Fortran MEs ( 1 ) : 2.2280s for 90112 events => throughput is 4.04E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0129s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8310s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5506s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2804s for 8192 events => throughput is 2.92E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3959s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 4.5869s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5062s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.0808s for 90112 events => throughput is 2.92E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.982970e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.993921e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317668354515] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3955s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1340s for 8192 events => throughput is 6.11E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607748863) differ by less than 3E-14 (2.453592884421596e-14) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354515) differ by less than 3E-14 (2.475797344914099e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.4996s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.8244s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3455s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4789s for 90112 events => throughput is 6.09E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.226693e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.234741e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4803s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3931s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0670s for 8192 events => throughput is 1.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5052s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.0129s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2766s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7364s for 90112 events => throughput is 1.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3870s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.295404e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.290087e+05 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7535s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317668354760] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7462s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.10E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354760) differ by less than 3E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9272s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.5896s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0823s for 90112 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.143071e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.166208e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.672252e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.305892e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.673543e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.841579e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.660325e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.409698e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b22193f403..6dbd1892b1 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:38:22 +DATE: 2024-03-03_16:51:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6628s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s - [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => 
throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s - [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4634s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s + [COUNTERS] Fortran MEs ( 1 ) : 0.2030s for 8192 events => throughput is 4.04E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.9722s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s - [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 3.4305s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2012s + [COUNTERS] Fortran MEs ( 1 ) : 2.2293s for 90112 events => throughput is 4.04E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112291597608296] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9336s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7637s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5087s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2550s for 8192 events => throughput is 3.21E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722621426752) differ by less than 4E-4 (2.569659680817793e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291597608296) differ by less than 4E-4 (2.5781178285555484e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3385s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239221732791437E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 4.2741s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4554s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.8186s for 90112 events => throughput is 3.20E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002) differ by less than 4E-4 (1.719182115555995e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239221732791437E-002) differ by less than 4E-4 (1.8599953477416165e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.320467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.329030e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112290421591680] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4987s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4095s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0755s for 8192 events => throughput is 1.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720710186394) differ by less than 4E-4 (2.758652844936371e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112290421591680) differ by less than 4E-4 (2.6944132867079418e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5977s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239212368085274E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.1111s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2830s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8281s for 90112 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002) differ by less than 4E-4 (3.4258681169685445e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239212368085274E-002) differ by less than 4E-4 (3.0418222529693395e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.109626e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108708e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112291415112837] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4003s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3307s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291415112837) differ by less than 4E-4 (2.5961646764605106e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9911s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9239211617250407E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.6130s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2385s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3745s for 90112 events => throughput is 2.41E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239211617250407E-002) differ by less than 4E-4 (3.136577692020026e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3910s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9540s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.450541e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.471338e+05 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723387847480) differ by less than 4E-4 (2.4938721023826105e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0938s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002) differ by less than 4E-4 (2.211270000440635e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112292787307366] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5768s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5749s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.14E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112292787307366) differ by less than 4E-4 (2.4604693221741414e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9141s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9239222545537072E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.5198s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4981s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 90112 events => throughput is 4.15E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239222545537072E-002) differ by less than 4E-4 (1.7574267630049434e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.727519e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.649552e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.471880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.087978e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.472857e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.639473e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.429181e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.261281e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 994bc4f8f2..6d86d2497e 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' 
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:39:00 +DATE: 2024-03-03_16:52:15 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s - [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4851s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2827s + [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => 
throughput is 4.05E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s - [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4619s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2590s + [COUNTERS] Fortran MEs ( 1 ) : 0.2029s for 8192 events => throughput is 4.04E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.0099s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s - [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 3.4326s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2053s + [COUNTERS] Fortran MEs ( 1 ) : 2.2273s for 90112 events => throughput is 4.05E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317761225882] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9635s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2862s for 8192 events => throughput is 2.86E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700702684) differ by less than 2E-4 (9.191721828116783e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317761225882) differ by less than 2E-4 (9.183959592817814e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.4154s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237217958461E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 4.6470s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4909s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.1561s for 90112 events => throughput is 2.86E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002) differ by less than 2E-4 (9.423232416594374e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237217958461E-002) differ by less than 2E-4 (9.4234364755863e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.921904e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.929545e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317763556192] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5310s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3947s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1363s for 8192 events => throughput is 6.01E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805033) differ by less than 2E-4 (9.399612865834683e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317763556192) differ by less than 2E-4 (9.41440236879032e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.4743s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237221421968E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 2.8380s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3410s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4970s for 90112 events => throughput is 6.02E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002) differ by less than 2E-4 (9.469362849401364e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237221421968E-002) differ by less than 2E-4 (9.467145956065792e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.207090e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.204637e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317741957558] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0644s for 8192 events => throughput is 1.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317741957558) differ by less than 2E-4 (7.278528668663853e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.4752s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9239237072275287E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.9796s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2698s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7097s for 90112 events => throughput is 1.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237072275287E-002) differ by less than 2E-4 (7.584913142011374e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3626s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.309433e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.313768e+05 ) sec^-1 -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700265108) differ by less than 2E-4 (9.148451995955043e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7973s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002) differ by less than 2E-4 (9.255082034087536e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112317662375726] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7459s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5717s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5642s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) +OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317662375726) differ by less than 2E-4 (5.9126292750733e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9191s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9239236476482192E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1899 events (found 1904 events) + [COUNTERS] PROGRAM TOTAL : 1.5944s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 90112 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) +OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236476482192E-002) differ by less than 2E-4 (6.599809587726213e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.137679e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.166979e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.672913e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.303219e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 
(gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.672319e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.840496e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.661363e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.406189e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 455a867420..a0a2aa6349 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:39:43 +DATE: 2024-03-03_16:52:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.5262s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s - [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.8610s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s + [COUNTERS] Fortran MEs ( 1 ) : 2.4932s for 8192 events => throughput is 3.29E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4601s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s - [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2756s + [COUNTERS] Fortran MEs ( 1 ) : 2.4994s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.7126s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s - [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 29.0676s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5877s + [COUNTERS] Fortran MEs ( 1 ) : 27.4799s for 90112 events => throughput is 3.28E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.7056s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.7622s + [COUNTERS] Fortran Overhead ( 0 ) : 3.9898s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7724s for 8192 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 53.1561s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 46.7317s + [COUNTERS] Fortran Overhead ( 0 ) : 5.2704s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.4613s for 90112 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.255639e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.251508e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7773s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5765s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9114s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6651s for 8192 events => throughput is 4.92E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 29.0103s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 21.5394s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2414s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.2979s for 90112 events => throughput is 4.92E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.088805e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.094091e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579728E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2569s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6557s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9594s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6963s for 8192 events => throughput is 1.18E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579728E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.7501s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 9.9655s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2947s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.6708s for 90112 events => throughput is 1.17E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0290s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2922s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.213870e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.218553e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5040s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.7910s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143122579723E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9784s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 8192 events => throughput is 7.25E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579723E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8233s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914653E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 3.3306s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0862s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2445s for 90112 events => throughput is 7.24E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914653E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.295001e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.519639e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.249001e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.036607e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.239941e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.231726e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.244867e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.391147e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 5e945a4db8..acf204f2f8 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:43:57 +DATE: 2024-03-03_16:56:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s - [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7942s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2798s + [COUNTERS] Fortran MEs ( 1 ) : 2.5144s for 8192 events => throughput is 3.26E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7719s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2755s + [COUNTERS] Fortran MEs ( 1 ) : 2.4964s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.5707s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s - [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 29.0770s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5941s + [COUNTERS] Fortran MEs ( 1 ) : 27.4828s for 90112 events => throughput is 3.28E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704259755238570E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.9402s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5788s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3614s for 8192 events => throughput is 2.44E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703729438336302E-004) differ by less than 4E-4 (3.021119383106452e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704259755238570E-004) differ by less than 4E-4 (3.0134411834747965e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 51.1261s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793580182117605E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 41.8891s + [COUNTERS] Fortran Overhead ( 0 ) : 4.9220s + [COUNTERS] CudaCpp MEs ( 2 ) : 36.9671s for 90112 events => throughput is 2.44E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004) differ by less than 4E-4 (3.0382263187522796e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580182117605E-004) differ by less than 4E-4 (3.024668687290344e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.491248e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.495341e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704254541054809E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5531s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9396s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8376s for 8192 events => throughput is 9.78E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722581317850E-004) differ by less than 4E-4 (2.843951981690296e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254541054809E-004) differ by less than 4E-4 (2.8787221757475834e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.4011s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793578161882866E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 11.5930s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4020s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.1910s for 90112 events => throughput is 9.80E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004) differ by less than 4E-4 (2.856718252175483e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578161882866E-004) differ by less than 4E-4 (2.896753368286653e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.009110e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.004329e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704254166302247E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.3122s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9791s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6276s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254166302247E-004) differ by less than 4E-4 (2.8690396836061893e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.8863s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793578009696313E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 5.8051s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9601s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8450s for 90112 events => throughput is 2.34E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578009696313E-004) differ by less than 4E-4 (2.887117363403746e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.2113s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.420571e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.416575e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.4119s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703728658657426E-004) differ by less than 4E-4 (3.0009745224379714e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.3753s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004) differ by less than 4E-4 (3.0604373708609245e-06) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704261630635685E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8334s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8082s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0555s for 8192 events => throughput is 1.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704261630635685E-004) differ by less than 4E-4 (3.0618958697381515e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.7017s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793580869662166E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 2.6027s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9914s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6113s for 90112 events => throughput is 1.47E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580869662166E-004) differ by less than 4E-4 (3.0682019858119247e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.492169e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.854767e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.716188e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.317447e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.712158e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.068831e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.710239e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518477e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.426106e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 4a1ef98d00..26322b0196 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' 
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:47:17 +DATE: 2024-03-03_16:59:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s - [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7778s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2789s + [COUNTERS] Fortran MEs ( 1 ) : 2.4990s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s - [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2749s + [COUNTERS] Fortran MEs ( 1 ) : 2.4966s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.6222s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s - [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 29.0872s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6251s + [COUNTERS] Fortran MEs ( 1 ) : 27.4620s for 90112 events => throughput is 3.28E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143272044121E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.7912s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.7541s + [COUNTERS] Fortran Overhead ( 0 ) : 3.9555s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7985s for 8192 events => throughput is 2.16E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612659176674E-004) differ by less than 2E-4 (3.851690077993908e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143272044121E-004) differ by less than 2E-4 (3.861716058040088e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 53.4090s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532474032691E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 47.0949s + [COUNTERS] Fortran Overhead ( 0 ) : 5.2971s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.7978s for 90112 events => throughput is 2.16E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004) differ by less than 2E-4 (3.930950231989527e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532474032691E-004) differ by less than 2E-4 (3.933131154099101e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.227302e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.229561e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143304774347E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7232s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5587s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9184s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6403s for 8192 events => throughput is 4.99E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612692816703E-004) differ by less than 2E-4 (4.720860369289426e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143304774347E-004) differ by less than 2E-4 (4.707367828871156e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 28.6711s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532476698221E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 21.2170s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1859s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.0311s for 90112 events => throughput is 5.00E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004) differ by less than 2E-4 (4.1013439311399225e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532476698221E-004) differ by less than 2E-4 (4.101904815811963e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.158008e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.137988e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143287857844E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2625s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6233s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9443s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6790s for 8192 events => throughput is 1.21E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143287857844E-004) differ by less than 2E-4 (4.2702956726259345e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.6031s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532473043530E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 9.7193s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2475s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.4718s for 90112 events => throughput is 1.21E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532473043530E-004) differ by less than 2E-4 (3.870500364655527e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0253s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2295s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.231500e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.231333e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.4883s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.1764s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8704143124638075E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1139s for 8192 events => throughput is 7.19E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) +OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143124638075E-004) differ by less than 2E-4 (5.318190332559425e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8559s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793532411887058E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1358 events (found 1880 events) + [COUNTERS] PROGRAM TOTAL : 3.3445s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0956s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2489s for 90112 events => throughput is 7.22E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) +OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411887058E-004) differ by less than 2E-4 (1.7474910407599964e-12) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.300806e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.521772e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.248268e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.028200e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.248122e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.232629e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.249568e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430504e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.379759e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 6ba33cd625..88cac494c1 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' 
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:53:01 +DATE: 2024-03-03_16:54:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.0689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s - [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.6134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4697s + [COUNTERS] Fortran MEs ( 1 ) : 54.1437s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.2818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s - [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.6041s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4623s + [COUNTERS] Fortran MEs ( 1 ) : 54.1418s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1058.3505s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s - [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 598.7866s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0634s + [COUNTERS] Fortran MEs ( 1 ) : 595.7233s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 212.3366s - [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s + [COUNTERS] PROGRAM TOTAL : 174.5096s + [COUNTERS] Fortran Overhead ( 0 ) : 79.6890s + [COUNTERS] CudaCpp MEs ( 2 ) : 94.8205s for 8192 events => throughput is 8.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085453E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1356.0370s - [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s - [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1131.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 82.0836s + [COUNTERS] CudaCpp MEs ( 2 ) : 1049.2229s for 90112 events => throughput is 8.59E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085453E-007) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.035526e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.037699e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 107.3498s - [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 81.7893s + [COUNTERS] Fortran Overhead ( 0 ) : 36.9957s + [COUNTERS] CudaCpp MEs ( 2 ) : 44.7936s for 8192 events => throughput is 1.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939197E-006) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085448E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 690.9132s - [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s - [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 531.7217s + [COUNTERS] Fortran Overhead ( 0 ) : 39.5074s + [COUNTERS] CudaCpp MEs ( 2 ) : 492.2143s for 90112 events => throughput is 1.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007) differ by less than 3E-14 (2.220446049250313e-15) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085448E-007) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.258682e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.250805e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.9431s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 35.2764s + [COUNTERS] Fortran Overhead ( 0 ) : 16.0692s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.2071s for 8192 events => throughput is 4.27E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085445E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 318.2044s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s - [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 228.9946s + [COUNTERS] Fortran Overhead ( 0 ) : 18.5816s + [COUNTERS] CudaCpp MEs ( 2 ) : 210.4130s for 90112 events => throughput is 4.28E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085445E-007) differ by less than 3E-14 (1.1102230246251565e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 44.2064s - [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 291.1048s - [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s - [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.282079e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 45.8566s - [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.275234e+02 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 285.1342s - [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s - [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 4.2510s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.7985s + [COUNTERS] Fortran Overhead ( 0 ) : 7.9861s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8124s for 8192 events => throughput is 2.15E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085437E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 18.8198s - [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s + [COUNTERS] PROGRAM TOTAL : 52.1048s + [COUNTERS] Fortran Overhead ( 0 ) : 10.1597s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.9451s for 90112 events => throughput is 2.15E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085437E-007) differ by less than 3E-14 (8.881784197001252e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.179470e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.192534e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.561647e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.458311e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.554068e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.511239e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.545851e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.234312e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.123455e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 2b7ca2c190..d41090c8e2 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_05:18:49 +DATE: 2024-03-03_17:49:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.8320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s - [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.5124s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3701s + [COUNTERS] Fortran MEs ( 1 ) : 54.1423s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.1294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s - [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.5924s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s + [COUNTERS] Fortran MEs ( 1 ) : 54.1674s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1058.3011s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s - [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 598.2306s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0375s + [COUNTERS] Fortran MEs ( 1 ) : 595.1931s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405363572559468E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 197.7089s - [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s - [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 161.5735s + [COUNTERS] Fortran Overhead ( 0 ) : 74.1579s + [COUNTERS] CudaCpp MEs ( 2 ) : 87.4157s for 8192 events => throughput is 9.37E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363572559468E-006) differ by less than 4E-4 (0.00013984863241267576) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326080615569212E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1274.0074s - [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s - [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1038.0662s + [COUNTERS] Fortran Overhead ( 0 ) : 76.7283s + [COUNTERS] CudaCpp MEs ( 2 ) : 961.3379s for 90112 events => throughput is 9.37E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326080615569212E-007) differ by less than 4E-4 (0.00014136252059526733) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.113338e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111807e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405361288903015E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.6519s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 39.5402s + [COUNTERS] Fortran Overhead ( 0 ) : 18.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 21.3663s for 8192 events => throughput is 3.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405717007921116E-006) differ by less than 4E-4 (0.00013961480525170877) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405361288903015E-006) differ by less than 4E-4 (0.0001396645204514435) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326076878598447E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 315.8806s - [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s - [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 255.2177s + [COUNTERS] Fortran Overhead ( 0 ) : 20.6553s + [COUNTERS] CudaCpp MEs ( 2 ) : 234.5624s for 90112 events => throughput is 3.84E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007) differ by less than 4E-4 (0.00014114029707035236) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326076878598447E-007) differ by less than 4E-4 (0.00014120229226155523) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.647702e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.648968e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405360895331841E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 25.4788s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s + [COUNTERS] PROGRAM TOTAL : 17.7758s + [COUNTERS] Fortran Overhead ( 0 ) : 8.2128s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5629s for 8192 events => throughput is 8.57E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405360895331841E-006) differ by less than 4E-4 (0.00013963279012663143) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326069099562333E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 165.7549s - [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s - [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.7341s + [COUNTERS] Fortran Overhead ( 0 ) : 10.7789s + [COUNTERS] CudaCpp MEs ( 2 ) : 104.9552s for 90112 events => throughput is 8.59E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326069099562333E-007) differ by less than 4E-4 (0.00014086875419705436) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 22.3180s - [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 145.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s - [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.061906e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.052058e+03 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 23.0558s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719306052570E-006) differ by less than 4E-4 (0.00013980007888836354) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 144.1559s - [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s - [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007) differ by less than 4E-4 (0.00014108709892313165) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405363557292459E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 2.4934s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 6.2015s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3978s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8037s for 8192 events => throughput is 4.54E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363557292459E-006) differ by less than 4E-4 (0.00013984740156258724) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326074784076956E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 11.1120s - [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s + [COUNTERS] PROGRAM TOTAL : 26.7982s + [COUNTERS] Fortran Overhead ( 0 ) : 6.8766s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.9216s for 90112 events => throughput is 4.52E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326074784076956E-007) differ by less than 4E-4 (0.00014111248645076735) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.541018e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541711e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.381750e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.509825e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.374728e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.108824e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.391979e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.425348e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.092226e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 99d7cfbcd5..0ce4090d7a 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' 
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_06:24:34 +DATE: 2024-03-03_18:33:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.2156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s - [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7954s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s + [COUNTERS] Fortran MEs ( 1 ) : 54.4260s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.1318s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s - [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.5671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3751s + [COUNTERS] Fortran MEs ( 1 ) : 54.1919s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1057.5728s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s - [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 598.9863s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0430s + [COUNTERS] Fortran MEs ( 1 ) : 595.9433s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403629013416990E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 220.4361s - [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s - [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s + [COUNTERS] PROGRAM TOTAL : 175.2429s + [COUNTERS] Fortran Overhead ( 0 ) : 80.2048s + [COUNTERS] CudaCpp MEs ( 2 ) : 95.0382s for 8192 events => throughput is 8.62E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985299359844E-006) differ by less than 2E-4 (5.7578810608305275e-09) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629013416990E-006) differ by less than 2E-4 (5.7565425759520394e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783773791503E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1422.8276s - [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s - [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1125.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 82.6919s + [COUNTERS] CudaCpp MEs ( 2 ) : 1042.6765s for 90112 events => throughput is 8.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389404034161771e-09) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783773791503E-007) differ by less than 2E-4 (5.389840573855054e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.023694e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.034350e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403629009850969E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 110.5022s - [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 78.5615s + [COUNTERS] Fortran Overhead ( 0 ) : 35.1164s + [COUNTERS] CudaCpp MEs ( 2 ) : 43.4451s for 8192 events => throughput is 1.89E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985295828471E-006) differ by less than 2E-4 (5.473184350179849e-09) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629009850969E-006) differ by less than 2E-4 (5.469044328521022e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783784120318E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 715.3882s - [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s - [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 515.7699s + [COUNTERS] Fortran Overhead ( 0 ) : 37.8592s + [COUNTERS] CudaCpp MEs ( 2 ) : 477.9107s for 90112 events => throughput is 1.89E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007) differ by less than 2E-4 (5.830713245558172e-09) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783784120318E-007) differ by less than 2E-4 (5.832704319530535e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.364562e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.366448e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403629007633195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 48.5744s - [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s + [COUNTERS] PROGRAM TOTAL : 34.2387s + [COUNTERS] Fortran Overhead ( 0 ) : 15.3454s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.8933s for 8192 events => throughput is 4.34E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629007633195E-006) differ by less than 2E-4 (5.290244020628165e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783783946155E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 319.2663s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s - [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s + [COUNTERS] PROGRAM TOTAL : 227.2672s + [COUNTERS] Fortran Overhead ( 0 ) : 17.8656s + [COUNTERS] CudaCpp MEs ( 2 ) : 209.4016s for 90112 events => throughput is 4.30E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783783946155E-007) differ by less than 2E-4 (5.825236737422301e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.4540s - [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 277.3470s - [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.492842e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.491012e+02 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 45.2143s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 278.0679s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s - [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403628931370709E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 3.5884s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.2202s + [COUNTERS] Fortran Overhead ( 0 ) : 8.0547s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1655s for 8192 events => throughput is 1.97E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) +OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628931370709E-006) differ by less than 2E-4 (8.581571009358413e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322783640044522E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 15.9902s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s + [COUNTERS] PROGRAM TOTAL : 56.4471s + [COUNTERS] Fortran Overhead ( 0 ) : 10.5120s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.9352s for 90112 events => throughput is 1.96E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) +OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783640044522E-007) differ by less than 2E-4 (3.447657714872321e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.973675e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.002670e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.263024e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394768e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 
12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.327371e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.281455e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.327509e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.656493e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.081275e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 8e9ad5ba7a..918e7c2a67 100644 --- 
a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:51:32 +DATE: 2024-03-03_17:03:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4944s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => 
throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3243s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2522s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s - [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2591s + [COUNTERS] Fortran MEs ( 1 ) : 0.5255s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263335) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343820] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7647s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343820) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.204533e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.208719e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166122] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4004s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2986s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 8192 events => throughput is 2.45E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262530) differ by less than 3E-14 (2.9531932455029164e-14) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166122) differ by less than 3E-14 (2.9531932455029164e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9658s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6201s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2532s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3670s for 90112 events => throughput is 2.46E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.474720e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.504610e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3643s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3021s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7585s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4255s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 90112 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.848976e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.65E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.846140e+05 ) sec^-1 -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8893s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9068s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. 
+ Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 63166c80e0..e0426fd000 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:52:02 +DATE: 2024-03-03_17:03:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4536s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3591s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3110s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3907s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3133s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2652s + [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2714s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s - [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7408s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2155s + [COUNTERS] Fortran MEs ( 1 ) : 0.5254s for 90112 events => throughput is 1.72E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110149549279866] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3226s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0563s for 8192 events => throughput is 1.46E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110463093540638) differ by less than 4E-4 (2.812844174915341e-06) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110149549279866) differ by less than 4E-4 (2.840326210895583e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510678843355344] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3150s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9062s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2863s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 90112 events => throughput is 1.45E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ by less than 4E-4 (1.3172298474195543e-08) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510678843355344) differ by less than 4E-4 (4.2350520312872675e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.483641e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.487281e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110146988852984] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3063s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2863s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110459152958460) differ by less than 4E-4 (2.9581965829139634e-06) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110146988852984) differ by less than 4E-4 (2.934771267448788e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510676993136629] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7697s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4589s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2202s for 90112 events => throughput is 4.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ by less than 4E-4 (1.6458771667782202e-07) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676993136629) differ by less than 4E-4 (1.2836447871311663e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.266778e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.196387e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110148793566186] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3421s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2748s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 8192 events => throughput is 8.47E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110148793566186) differ by less than 4E-4 (2.8682018052839098e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510676419088856] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6208s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3376s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2310s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1067s for 90112 events => throughput is 8.45E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676419088856) differ by less than 4E-4 (1.5505111905511626e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.755084e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.808083e+05 ) sec^-1 -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6561s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ by less than 4E-4 (5.3231167917999755e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7423s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8968s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < 
/tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index eb4ca92d13..e46d313ea3 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:52:30 +DATE: 2024-03-03_17:04:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s - [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3623s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3142s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s - [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3135s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2653s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2499s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s - [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7475s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2214s + [COUNTERS] Fortran MEs ( 1 ) : 0.5261s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226549005623] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4039s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005623) differ by less than 2E-4 (7.972267290767832e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679758658835] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4601s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0500s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2907s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7593s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ by less than 2E-4 (1.967879192932287e-10) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658835) differ by less than 2E-4 (2.0059864880295208e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.207970e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.211751e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226549005628] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3940s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3342s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.45E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005628) differ by less than 2E-4 (7.972245086307339e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679758658832] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9359s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6263s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2582s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3682s for 90112 events => throughput is 2.45E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ by less than 2E-4 (1.9678769724862377e-10) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658832) differ by less than 2E-4 (2.0059842675834716e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.493733e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.488023e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110226530029391] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3636s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3020s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.72E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) +OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226530029391) differ by less than 2E-4 (7.796884249344771e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510679756340242] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7468s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4279s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1888s for 90112 events => throughput is 4.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) +OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679756340242) differ by less than 2E-4 (9.281064805577444e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.847689e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.864616e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3573s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7304s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4046s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9542s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7473s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8944s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. 
+ Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 From ed81e86705a5b6ab5ad357ab3c8cc5796881b137 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 19:48:43 +0200 Subject: [PATCH 89/96] [susy2] go back to itscrd90 test logs git checkout f4d951c7ddfc635707c14e0fe5a0628fd4aec0ac tput/logs_* tmad/logs_* --- .../log_eemumu_mad_d_inl0_hrd0.txt | 412 +++++++++++----- .../log_eemumu_mad_f_inl0_hrd0.txt | 424 +++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 402 +++++++++++----- .../log_ggtt_mad_d_inl0_hrd0.txt | 412 +++++++++++----- .../log_ggtt_mad_f_inl0_hrd0.txt | 420 ++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 404 +++++++++++----- .../log_ggttg_mad_d_inl0_hrd0.txt | 430 +++++++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 428 +++++++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 432 +++++++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 426 +++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 420 ++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 420 ++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 445 ++++++++++++----- .../log_gqttq_mad_f_inl0_hrd0.txt | 449 +++++++++++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 447 ++++++++++++----- .../log_eemumu_mad_d_inl0_hrd0.txt | 238 ++++++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 246 ++++++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 221 +++++---- 
.../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 241 ++++++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 234 +++++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 238 ++++++---- .../log_eemumu_mad_d_inl1_hrd1.txt | 238 ++++++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 248 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 254 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 231 +++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 249 ++++++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 248 ++++++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 248 ++++++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 248 ++++++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 234 +++++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 234 +++++---- .../log_ggtt_mad_d_inl0_hrd0.txt | 238 ++++++---- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 246 ++++++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 221 +++++---- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 241 ++++++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 234 +++++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 238 ++++++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 234 +++++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 252 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 260 +++++----- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 241 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 255 +++++----- .../log_ggtt_mad_f_inl0_hrd1.txt | 252 ++++++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 252 ++++++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 252 ++++++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 238 ++++++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 238 ++++++---- .../log_ggttg_mad_d_inl0_hrd0.txt | 263 +++++----- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 273 ++++++----- .../log_ggttg_mad_d_inl0_hrd1.txt | 263 +++++----- .../log_ggttg_mad_f_inl0_hrd0.txt | 277 ++++++----- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 287 ++++++----- .../log_ggttg_mad_f_inl0_hrd1.txt | 277 ++++++----- .../log_ggttg_mad_m_inl0_hrd0.txt | 259 +++++----- .../log_ggttg_mad_m_inl0_hrd1.txt | 259 +++++----- .../log_ggttgg_mad_d_inl0_hrd0.txt 
| 263 +++++----- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 273 ++++++----- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 244 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 268 ++++++----- .../log_ggttgg_mad_d_inl0_hrd1.txt | 263 +++++----- .../log_ggttgg_mad_d_inl1_hrd0.txt | 271 ++++++----- .../log_ggttgg_mad_d_inl1_hrd1.txt | 271 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0.txt | 279 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 289 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 270 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 284 ++++++----- .../log_ggttgg_mad_f_inl0_hrd1.txt | 277 ++++++----- .../log_ggttgg_mad_f_inl1_hrd0.txt | 275 ++++++----- .../log_ggttgg_mad_f_inl1_hrd1.txt | 275 ++++++----- .../log_ggttgg_mad_m_inl0_hrd0.txt | 259 +++++----- .../log_ggttgg_mad_m_inl0_hrd1.txt | 259 +++++----- .../log_ggttggg_mad_d_inl0_hrd0.txt | 259 +++++----- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 269 ++++++----- .../log_ggttggg_mad_d_inl0_hrd1.txt | 259 +++++----- .../log_ggttggg_mad_f_inl0_hrd0.txt | 275 ++++++----- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 285 ++++++----- .../log_ggttggg_mad_f_inl0_hrd1.txt | 275 ++++++----- .../log_ggttggg_mad_m_inl0_hrd0.txt | 259 +++++----- .../log_ggttggg_mad_m_inl0_hrd1.txt | 259 +++++----- .../log_gqttq_mad_d_inl0_hrd0.txt | 258 +++++++--- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 270 ++++++++--- .../log_gqttq_mad_d_inl0_hrd1.txt | 258 +++++++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 258 +++++++--- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 270 ++++++++--- .../log_gqttq_mad_f_inl0_hrd1.txt | 258 +++++++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 258 +++++++--- .../log_gqttq_mad_m_inl0_hrd1.txt | 258 +++++++--- 90 files changed, 16316 insertions(+), 9799 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 7e993f4ca8..fb2022a061 100644 --- 
a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-03_16:48:45 +DATE: 2024-03-01_03:35:28 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5695s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5635s - [COUNTERS] Fortran MEs ( 1 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6832s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1301s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1761s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2819s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1629s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1569s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000780E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2266s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0653s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000780E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.428280e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.439855e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1419s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1384s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661518E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2644s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2256s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0387s for 90112 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392008e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.453554e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1398s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1374s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2227s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0273s for 90112 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.423627e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.552803e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.80E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715404661545E-002) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,18 +538,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5071s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5028s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.07E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.186318e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.600394e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.242769e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.894046e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.247301e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.953286e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.191475e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549947e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 
302bb64830..130936da07 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' 
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-03_16:49:05 +DATE: 2024-03-01_03:35:44 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5030s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1370s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1311s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2181s - [COUNTERS] Fortran MEs ( 1 ) : 0.0624s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382701684199335E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1451s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1398s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382701684199335E-002) differ by less than 4E-4 (1.4692721372888684e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382703205998396E-002) differ by less than 4E-4 (1.306308462512007e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515588842633111E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2812s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2244s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 90112 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3578s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515588842633111E-002) differ by less than 4E-4 (1.439903947186849e-07) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002) differ by less than 4E-4 (1.2999352305698153e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.658570e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.672811e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719831741665E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1361s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719831741665E-002) differ by less than 4E-4 (4.740791825774693e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700723828302E-002) differ by less than 4E-4 (1.5721146218172777e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606481761602E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2446s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2212s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 90112 events => throughput is 3.84E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606481761602E-002) differ by less than 4E-4 (4.875410031246474e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002) differ by less than 4E-4 (1.5742791048545257e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.091998e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.235774e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382719700521907E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1378s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1361s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382719700521907E-002) differ by less than 4E-4 (4.6002735842876064e-08) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515606480805645E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2404s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2208s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0196s for 90112 events => throughput is 4.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3181s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515606480805645E-002) differ by less than 4E-4 (4.874365444607065e-08) +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.829243e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382700679354239E-002) differ by less than 4E-4 (1.576877179942926e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002) differ by less than 4E-4 (1.573566908996682e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.004255e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382704335459282E-002) differ by less than 4E-4 (1.1853587900123586e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002) differ by less than 4E-4 (1.1717945325173673e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382704338101225E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4176s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.62E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382704338101225E-002) differ by less than 4E-4 (1.1850758729892164e-07) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515591361999701E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5010s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.02E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591361999701E-002) differ by less than 4E-4 (1.1646102771045719e-07) +OK! 
xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.783310e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.366509e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.344857e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.689472e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322265e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.874276e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.118728e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.930551e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index ef30c48422..da7367ae5e 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-03_16:49:25 +DATE: 2024-03-01_03:36:01 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.5406s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5347s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661518E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1371s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1312s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2180s - [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s + [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1432s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701395E-002) differ by less than 2E-4 (1.7176482458580722e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2915s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2259s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0656s for 90112 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3592s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.410977e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429208e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1413s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1379s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0034s for 8192 events => throughput is 2.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.717646025412023e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715420701354E-002) differ by less than 2E-4 (1.7176438049659737e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2603s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2229s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0374s for 90112 events => throughput is 2.41E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.470891e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528271e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1390s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1364s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484913930753692e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2498s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2215s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0283s for 90112 events => throughput is 3.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.332736e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.448928e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1830s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715383664494E-002) differ by less than 2E-4 (2.2484925032983938e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.3270s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002) differ by less than 2E-4 (2.947131427788463e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715392009222E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.4147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.04E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.3382715404661518E-002) and cpp (9.3382715392009222E-002) differ by less than 2E-4 (1.3548862032308762e-10) +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,18 +538,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.5033s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4990s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 90112 events => throughput is 2.08E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002 OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.194063e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.598738e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.245461e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.916171e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = 
SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.280594e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.964470e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.206983e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.552281e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 
46de15fd70..657075d34f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-03_16:49:44 +DATE: 2024-03-01_03:36:18 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7329s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7043s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8052s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s + [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 
1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3021s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3429s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0301s - [COUNTERS] Fortran MEs ( 1 ) : 0.3128s for 90112 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6297s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s + [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3727s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756647) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989099] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4231s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0658s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3573s for 90112 events => throughput is 2.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989099) differ by less than 3E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.598891e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.609518e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756619] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756619) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989085] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2463s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1954s for 90112 events => throughput is 4.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4997s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989085) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.739258e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.744593e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0103s for 8192 events => throughput is 7.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1533s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0403s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989114) differ by less than 3E-14 (0.0) +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.234509e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.326375e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3919s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756626) differ by less than 3E-14 (3.3306690738754696e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.5267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,8 +505,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -366,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5794s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5787s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184803756640) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,8 +538,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -399,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3240s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3165s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.21E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.429406e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049159e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777279e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.754790e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.779709e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.941572e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.764332e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142528e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 243b746871..eb011c6697 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-03_16:50:10 +DATE: 2024-03-01_03:36:45 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5611s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 
2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3375s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3431s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0309s - [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6449s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s + [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094178241446492] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3491s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3217s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0274s for 8192 events => throughput is 2.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094178241446492) differ by less than 4E-4 (1.3934438314322506e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094179780921394) differ by less than 4E-4 (1.0665510541407741e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105686930681671] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3609s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3017s for 90112 events => throughput is 2.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6592s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105686930681671) differ by less than 4E-4 (1.7724624157278157e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ by less than 4E-4 (1.4224799227413598e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.115125e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.119656e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176373190514] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3069s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094176373190514) differ by less than 4E-4 (1.7901501314643298e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094175850060040) differ by less than 4E-4 (1.9012318908107062e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105685173093654] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1832s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0422s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1410s for 90112 events => throughput is 6.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105685173093654) differ by less than 4E-4 (2.1455782361901043e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ by less than 4E-4 (2.2324275217311396e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.830369e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.554914e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094174474272364] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3070s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094174474272364) differ by less than 4E-4 (2.1933672500473733e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105684585116684] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1062s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0366s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0696s for 90112 events => throughput is 1.29E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3368s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105684585116684) differ by less than 4E-4 (2.2703990176786704e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.369839e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094173652938650) differ by less than 4E-4 (2.3677696170398832e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3377s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ by less than 4E-4 (2.384278946498952e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.384682e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094178213275804) differ by less than 4E-4 (1.3994256109484127e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ by less than 4E-4 (1.4588574703822133e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094176770070867] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5810s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5807s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.42E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094176770070867) differ by less than 4E-4 (1.705876382374072e-07) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105687115703695] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3154s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 90112 events => throughput is 2.50E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7005s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105687115703695) differ by less than 4E-4 (1.733184357144424e-07) +OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.036278e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.234773e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.078030e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.031314e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.091165e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.108729e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.168153e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.508893e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index d4f3b3626b..bef66309f6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be 
done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-03_16:50:35 +DATE: 2024-03-01_03:37:12 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5631s - [COUNTERS] Fortran MEs ( 1 ) : 0.0285s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 
1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=0 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3206s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3443s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0322s - [COUNTERS] Fortran MEs ( 1 ) : 0.3122s for 90112 events => throughput is 2.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6496s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s + [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3604s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3274s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428942997143e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4306s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0684s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541809e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.553175e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186141863908] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3293s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3114s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186141863908) differ by less than 2E-4 (2.8413429165041748e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186141863901) differ by less than 2E-4 (2.8413428720952538e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2460s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0575s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1884s for 90112 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.839969e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.879227e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094186193208834] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094186193208834) differ by less than 2E-4 (2.9503689491505725e-08) +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105696667630852] fbridge_mode=1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1520s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0417s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1103s for 90112 events => throughput is 8.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4021s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.105695279989114) and cpp (47.105696667630852) differ by less than 2E-4 (2.9458046002517335e-08) +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.436276e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.512312e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094186169585456) differ by less than 2E-4 (2.9002069412698006e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.4767s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ by less than 2E-4 (2.9364318976377035e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184798437837] fbridge_mode=1 + [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.5801s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5794s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.13E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.094184803756626) and cpp (47.094184798437837) differ by less than 2E-4 (1.1293943558143837e-10) +OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,8 +538,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -399,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3264s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3189s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6837s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -412,43 +560,43 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ b OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.598819e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.054804e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.803359e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800501e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] 
[hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.773858e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.012601e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.783351e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.162857e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 1095013ce9..cd3823dd45 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' 
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:51:01 +DATE: 2024-03-01_03:37:39 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.5655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3630s - [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => 
throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4596s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s - [COUNTERS] Fortran MEs ( 1 ) : 0.2025s for 8192 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4323s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2043s - [COUNTERS] Fortran MEs ( 1 ) : 2.2280s for 90112 events => throughput is 4.04E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 4.9846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s + [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5506s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2804s for 8192 events => throughput is 2.92E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.5869s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5062s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.0808s for 90112 events => throughput is 2.92E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.3959s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.982970e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.993921e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354515] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3955s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1340s for 8192 events => throughput is 6.11E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354515) differ by less than 3E-14 (2.475797344914099e-14) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607748863) differ by less than 3E-14 (2.453592884421596e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252514E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8244s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3455s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4789s for 90112 events => throughput is 6.09E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.4996s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252514E-002) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.226693e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.234741e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354763] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3931s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0670s for 8192 events => throughput is 1.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354763) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.0129s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2766s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7364s for 90112 events => throughput is 1.22E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5052s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.295404e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.3870s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.290087e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749110) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7535s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002) differ by less than 3E-14 (0.0) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354760] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5814s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317668354760) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5896s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0823s for 90112 events => throughput is 1.09E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236471252555E-002) differ by less than 3E-14 (0.0) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.143071e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.166208e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.672252e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.305892e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.673543e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.841579e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.660325e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.409698e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 6dbd1892b1..b22193f403 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' 
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:51:40 +DATE: 2024-03-01_03:38:22 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] Fortran MEs ( 1 ) : 0.2024s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6628s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => 
throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2604s - [COUNTERS] Fortran MEs ( 1 ) : 0.2030s for 8192 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4305s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2012s - [COUNTERS] Fortran MEs ( 1 ) : 2.2293s for 90112 events => throughput is 4.04E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 4.9722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291597608296] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7637s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5087s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2550s for 8192 events => throughput is 3.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291597608296) differ by less than 4E-4 (2.5781178285555484e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112722621426752) differ by less than 4E-4 (2.569659680817793e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239221732791437E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.2741s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4554s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8186s for 90112 events => throughput is 3.20E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.3385s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239221732791437E-002) differ by less than 4E-4 (1.8599953477416165e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002) differ by less than 4E-4 (1.719182115555995e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.320467e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.329030e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112290421591680] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4095s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0755s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112290421591680) differ by less than 4E-4 (2.6944132867079418e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112720710186394) differ by less than 4E-4 (2.758652844936371e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239212368085274E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.1111s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2830s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8281s for 90112 events => throughput is 1.09E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.5977s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239212368085274E-002) differ by less than 4E-4 (3.0418222529693395e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002) differ by less than 4E-4 (3.4258681169685445e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109626e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108708e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112291415112837] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3307s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 8192 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112291415112837) differ by less than 4E-4 (2.5961646764605106e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239211617250407E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.6130s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2385s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3745s for 90112 events => throughput is 2.41E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9911s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239211617250407E-002) differ by less than 4E-4 (3.136577692020026e-07) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.450541e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112721766950902) differ by less than 4E-4 (2.654154597325764e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9540s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002) differ by less than 4E-4 (3.5585866953180556e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.471338e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112723387847480) differ by less than 4E-4 (2.4938721023826105e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.0938s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002) differ by less than 4E-4 (2.211270000440635e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112292787307366] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5768s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5749s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.10112317668354764) and cpp (0.10112292787307366) differ by less than 4E-4 (2.4604693221741414e-06) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239222545537072E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5198s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4981s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 90112 events => throughput is 4.15E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239222545537072E-002) differ by less than 4E-4 (1.7574267630049434e-07) +OK! 
xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.727519e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.649552e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.471880e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.087978e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.472857e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639473e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429181e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.261281e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 
) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 6d86d2497e..994bc4f8f2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:52:15 +DATE: 2024-03-01_03:39:00 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4851s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2827s - [COUNTERS] Fortran MEs ( 1 ) : 0.2023s for 8192 events => throughput is 4.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s + [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => 
throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317668354764] fbridge_mode=0 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4619s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2590s - [COUNTERS] Fortran MEs ( 1 ) : 0.2029s for 8192 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s + [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236471252555E-002] fbridge_mode=0 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 3.4326s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2053s - [COUNTERS] Fortran MEs ( 1 ) : 2.2273s for 90112 events => throughput is 4.05E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s + [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317761225882] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.8269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5407s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2862s for 8192 events => throughput is 2.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317761225882) differ by less than 2E-4 (9.183959592817814e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700702684) differ by less than 2E-4 (9.191721828116783e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237217958461E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 4.6470s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4909s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1561s for 90112 events => throughput is 2.86E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 5.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237217958461E-002) differ by less than 2E-4 (9.4234364755863e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002) differ by less than 2E-4 (9.423232416594374e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.921904e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.929545e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317763556192] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5310s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3947s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1363s for 8192 events => throughput is 6.01E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6394s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317763556192) differ by less than 2E-4 (9.41440236879032e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748702805033) differ by less than 2E-4 (9.399612865834683e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237221421968E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 2.8380s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3410s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4970s for 90112 events => throughput is 6.02E+04 events/s + [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 3.4743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237221421968E-002) differ by less than 2E-4 (9.467145956065792e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002) differ by less than 2E-4 (9.469362849401364e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.207090e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.204637e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317741957558] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3879s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0644s for 8192 events => throughput is 1.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4771s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317741957558) differ by less than 2E-4 (7.278528668663853e-09) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239237072275287E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.9796s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2698s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7097s for 90112 events => throughput is 1.27E+05 events/s + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.4752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239237072275287E-002) differ by less than 2E-4 (7.584913142011374e-09) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.309433e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748681415580) differ by less than 2E-4 (7.284514991212632e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.3626s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002) differ by less than 2E-4 (7.592642958798024e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.313768e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.5403s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748700265108) differ by less than 2E-4 (9.148451995955043e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 2.7973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002) differ by less than 2E-4 (9.255082034087536e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112317662375726] fbridge_mode=1 + [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5717s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5642s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7459s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.10112317668354764) and cpp (0.10112317662375726) differ by less than 2E-4 (5.9126292750733e-10) +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9239236476482192E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1899 events (found 1904 events) - [COUNTERS] PROGRAM TOTAL : 1.5944s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5120s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 90112 events => throughput is 1.09E+06 events/s + [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9191s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (7.9239236471252555E-002) and cpp (7.9239236476482192E-002) differ by less than 2E-4 (6.599809587726213e-11) +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.137679e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.166979e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.672913e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.303219e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] 
[inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.672319e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.840496e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.661363e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.406189e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a0a2aa6349..455a867420 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' 
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:52:53 +DATE: 2024-03-01_03:39:43 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.8610s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3678s - [COUNTERS] Fortran MEs ( 1 ) : 2.4932s for 8192 events => throughput is 3.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s + [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2756s - [COUNTERS] Fortran MEs ( 1 ) : 2.4994s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0676s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5877s - [COUNTERS] Fortran MEs ( 1 ) : 27.4799s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 47.7126s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s + [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.7622s - [COUNTERS] Fortran Overhead ( 0 ) : 3.9898s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7724s for 8192 events => throughput is 2.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7056s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102372E-004) differ by less than 3E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 46.7317s - [COUNTERS] Fortran Overhead ( 0 ) : 5.2704s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.4613s for 90112 events => throughput is 2.17E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 53.1561s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004) differ by less than 3E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.255639e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.251508e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579739E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.5765s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9114s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6651s for 8192 events => throughput is 4.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7773s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579739E-004) differ by less than 3E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.5394s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2414s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.2979s for 90112 events => throughput is 4.92E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 29.0103s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.088805e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.094091e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579728E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6557s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9594s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6963s for 8192 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579728E-004) differ by less than 3E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914648E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.9655s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2947s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.6708s for 90112 events => throughput is 1.17E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 13.7501s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914648E-004) differ by less than 3E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.213870e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.0290s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 12.2922s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.218553e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 14.7910s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579723E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9784s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 8192 events => throughput is 7.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143122579723E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914653E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3306s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0862s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2445s for 90112 events => throughput is 7.24E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8233s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411914653E-004) differ by less than 3E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.295001e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.519639e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249001e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.036607e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.239941e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231726e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244867e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391147e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) 
sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index acf204f2f8..5e945a4db8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:56:42 +DATE: 2024-03-01_03:43:57 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7942s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2798s - [COUNTERS] Fortran MEs ( 1 ) : 2.5144s for 8192 events => throughput is 3.26E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7719s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2755s - [COUNTERS] Fortran MEs ( 1 ) : 2.4964s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0770s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5941s - [COUNTERS] Fortran MEs ( 1 ) : 27.4828s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 47.5707s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s + [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704259755238570E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.9402s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5788s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3614s for 8192 events => throughput is 2.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704259755238570E-004) differ by less than 4E-4 (3.0134411834747965e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703729438336302E-004) differ by less than 4E-4 (3.021119383106452e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580182117605E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 41.8891s - [COUNTERS] Fortran Overhead ( 0 ) : 4.9220s - [COUNTERS] CudaCpp MEs ( 2 ) : 36.9671s for 90112 events => throughput is 2.44E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 51.1261s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580182117605E-004) differ by less than 4E-4 (3.024668687290344e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004) differ by less than 4E-4 (3.0382263187522796e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.491248e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.495341e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254541054809E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.9396s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1020s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8376s for 8192 events => throughput is 9.78E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254541054809E-004) differ by less than 4E-4 (2.8787221757475834e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722581317850E-004) differ by less than 4E-4 (2.843951981690296e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578161882866E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 11.5930s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4020s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.1910s for 90112 events => throughput is 9.80E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 15.4011s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578161882866E-004) differ by less than 4E-4 (2.896753368286653e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004) differ by less than 4E-4 (2.856718252175483e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.009110e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.004329e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704254166302247E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9791s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6276s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3515s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704254166302247E-004) differ by less than 4E-4 (2.8690396836061893e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793578009696313E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 5.8051s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9601s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8450s for 90112 events => throughput is 2.34E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 7.8863s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793578009696313E-004) differ by less than 4E-4 (2.887117363403746e-06) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.420571e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703722425602170E-004) differ by less than 4E-4 (2.8399286962077497e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 7.2113s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004) differ by less than 4E-4 (2.852825495613942e-06) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.416575e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 1.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703728658657426E-004) differ by less than 4E-4 (3.0009745224379714e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 8.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004) differ by less than 4E-4 (3.0604373708609245e-06) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704261630635685E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8636s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8082s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0555s for 8192 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704261630635685E-004) differ by less than 4E-4 (3.0618958697381515e-06) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793580869662166E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 2.6027s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6113s for 90112 events => throughput is 1.47E+05 events/s + [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.7017s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793580869662166E-004) differ by less than 4E-4 (3.0682019858119247e-06) +OK! 
xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.492169e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.854767e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.716188e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.317447e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.712158e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.068831e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.710239e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.426106e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) 
= ( 2.518477e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 26322b0196..4a1ef98d00 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:59:48 +DATE: 2024-03-01_03:47:17 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 2.7778s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2789s - [COUNTERS] Fortran MEs ( 1 ) : 2.4990s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143122579712E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.7715s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2749s - [COUNTERS] Fortran MEs ( 1 ) : 2.4966s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s + [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411914656E-004] fbridge_mode=0 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 29.0872s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6251s - [COUNTERS] Fortran MEs ( 1 ) : 27.4620s for 90112 events => throughput is 3.28E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 47.6222s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s + [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143272044121E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.7541s - [COUNTERS] Fortran Overhead ( 0 ) : 3.9555s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7985s for 8192 events => throughput is 2.16E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7912s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143272044121E-004) differ by less than 2E-4 (3.861716058040088e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612659176674E-004) differ by less than 2E-4 (3.851690077993908e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532474032691E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 47.0949s - [COUNTERS] Fortran Overhead ( 0 ) : 5.2971s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.7978s for 90112 events => throughput is 2.16E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 53.4090s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532474032691E-004) differ by less than 2E-4 (3.933131154099101e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004) differ by less than 2E-4 (3.930950231989527e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.227302e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229561e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143304774347E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.5587s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9184s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6403s for 8192 events => throughput is 4.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7232s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143304774347E-004) differ by less than 2E-4 (4.707367828871156e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612692816703E-004) differ by less than 2E-4 (4.720860369289426e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532476698221E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 21.2170s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1859s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.0311s for 90112 events => throughput is 5.00E+03 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 28.6711s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532476698221E-004) differ by less than 2E-4 (4.101904815811963e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004) differ by less than 2E-4 (4.1013439311399225e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.158008e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.137988e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143287857844E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9443s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6790s for 8192 events => throughput is 1.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2625s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143287857844E-004) differ by less than 2E-4 (4.2702956726259345e-09) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532473043530E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 9.7193s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2475s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.4718s for 90112 events => throughput is 1.21E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 13.6031s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532473043530E-004) differ by less than 2E-4 (3.870500364655527e-09) +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231500e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.0253s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 12.2295s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231333e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 2.4883s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612675240517E-004) differ by less than 2E-4 (4.266737629876616e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 15.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004) differ by less than 2E-4 (3.873764420347925e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8704143124638075E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7736s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1139s for 8192 events => throughput is 7.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.8704143122579712E-004) and cpp (3.8704143124638075E-004) differ by less than 2E-4 (5.318190332559425e-11) +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793532411887058E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1358 events (found 1880 events) - [COUNTERS] PROGRAM TOTAL : 3.3445s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0956s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2489s for 90112 events => throughput is 7.22E+04 events/s + [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8559s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5793532411914656E-004) and cpp (1.5793532411887058E-004) differ by less than 2E-4 (1.7474910407599964e-12) +OK! 
xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.300806e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.521772e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.248268e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.028200e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.248122e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232629e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.249568e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.379759e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) 
= ( 1.430504e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 88cac494c1..6ba33cd625 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-03_16:54:11 +DATE: 2024-03-01_03:53:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.6134s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4697s - [COUNTERS] Fortran MEs ( 1 ) : 54.1437s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.0689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s + [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.6041s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4623s - [COUNTERS] Fortran MEs ( 1 ) : 54.1418s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 598.7866s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0634s - [COUNTERS] Fortran MEs ( 1 ) : 595.7233s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3505s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 174.5096s - [COUNTERS] Fortran Overhead ( 0 ) : 79.6890s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.8205s for 8192 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 212.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939193E-006) differ by less than 3E-14 (1.5543122344752192e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085453E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1131.3065s - [COUNTERS] Fortran Overhead ( 0 ) : 82.0836s - [COUNTERS] CudaCpp MEs ( 2 ) : 1049.2229s for 90112 events => throughput is 8.59E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1356.0370s + [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s + [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085453E-007) differ by less than 3E-14 (1.5543122344752192e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.035526e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.037699e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 81.7893s - [COUNTERS] Fortran Overhead ( 0 ) : 36.9957s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.7936s for 8192 events => throughput is 1.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.3498s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939197E-006) differ by less than 3E-14 (1.7763568394002505e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085448E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 531.7217s - [COUNTERS] Fortran Overhead ( 0 ) : 39.5074s - [COUNTERS] CudaCpp MEs ( 2 ) : 492.2143s for 90112 events => throughput is 1.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 690.9132s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s + [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085448E-007) differ by less than 3E-14 (1.3322676295501878e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007) differ by less than 3E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.258682e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.250805e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015001E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 35.2764s - [COUNTERS] Fortran Overhead ( 0 ) : 16.0692s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.2071s for 8192 events => throughput is 4.27E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.9431s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015001E-006) differ by less than 3E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085445E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 228.9946s - [COUNTERS] Fortran Overhead ( 0 ) : 18.5816s - [COUNTERS] CudaCpp MEs ( 2 ) : 210.4130s for 90112 events => throughput is 4.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 318.2044s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085445E-007) differ by less than 3E-14 (1.1102230246251565e-15) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.282079e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 44.2064s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 291.1048s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s + [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.275234e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 45.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939191E-006) differ by less than 3E-14 (1.3322676295501878e-15) -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 285.1342s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007) differ by less than 3E-14 (1.9984014443252818e-15) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942015003E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 11.7985s - [COUNTERS] Fortran Overhead ( 0 ) : 7.9861s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8124s for 8192 events => throughput is 2.15E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2510s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628942015003E-006) differ by less than 3E-14 (2.6645352591003757e-15) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085437E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 52.1048s - [COUNTERS] Fortran Overhead ( 0 ) : 10.1597s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.9451s for 90112 events => throughput is 2.15E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.8198s + [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783648085437E-007) differ by less than 3E-14 (8.881784197001252e-16) +OK! 
xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.179470e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.192534e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.561647e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.458311e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.554068e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.511239e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.545851e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.123455e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 3.234312e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index d41090c8e2..2b7ca2c190 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-03_17:49:34 +DATE: 2024-03-01_05:18:49 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5124s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3701s - [COUNTERS] Fortran MEs ( 1 ) : 54.1423s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.8320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5924s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4250s - [COUNTERS] Fortran MEs ( 1 ) : 54.1674s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.1294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s + [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 598.2306s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0375s - [COUNTERS] Fortran MEs ( 1 ) : 595.1931s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363572559468E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 161.5735s - [COUNTERS] Fortran Overhead ( 0 ) : 74.1579s - [COUNTERS] CudaCpp MEs ( 2 ) : 87.4157s for 8192 events => throughput is 9.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 197.7089s + [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363572559468E-006) differ by less than 4E-4 (0.00013984863241267576) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326080615569212E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1038.0662s - [COUNTERS] Fortran Overhead ( 0 ) : 76.7283s - [COUNTERS] CudaCpp MEs ( 2 ) : 961.3379s for 90112 events => throughput is 9.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1274.0074s + [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s + [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326080615569212E-007) differ by less than 4E-4 (0.00014136252059526733) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113338e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111807e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405361288903015E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 39.5402s - [COUNTERS] Fortran Overhead ( 0 ) : 18.1739s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.3663s for 8192 events => throughput is 3.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6519s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405361288903015E-006) differ by less than 4E-4 (0.0001396645204514435) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405717007921116E-006) differ by less than 4E-4 (0.00013961480525170877) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326076878598447E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 255.2177s - [COUNTERS] Fortran Overhead ( 0 ) : 20.6553s - [COUNTERS] CudaCpp MEs ( 2 ) : 234.5624s for 90112 events => throughput is 3.84E+02 events/s + [COUNTERS] PROGRAM TOTAL : 315.8806s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326076878598447E-007) differ by less than 4E-4 (0.00014120229226155523) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007) differ by less than 4E-4 (0.00014114029707035236) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.647702e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.648968e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405360895331841E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 17.7758s - [COUNTERS] Fortran Overhead ( 0 ) : 8.2128s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5629s for 8192 events => throughput is 8.57E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2405360895331841E-006) differ by less than 4E-4 (0.00013963279012663143) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326069099562333E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 115.7341s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7789s - [COUNTERS] CudaCpp MEs ( 2 ) : 104.9552s for 90112 events => throughput is 8.59E+02 events/s + [COUNTERS] PROGRAM TOTAL : 165.7549s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s + [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326069099562333E-007) differ by less than 4E-4 (0.00014086875419705436) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.061906e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 22.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405716659252656E-006) differ by less than 4E-4 (0.00013958669586155992) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 145.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007) differ by less than 4E-4 (0.00014080311959907554) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.052058e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 23.0558s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719306052570E-006) differ by less than 4E-4 (0.00013980007888836354) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 144.1559s + [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007) differ by less than 4E-4 (0.00014108709892313165) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405363557292459E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 6.2015s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3978s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8037s for 8192 events => throughput is 4.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2405363557292459E-006) differ by less than 4E-4 (0.00013984740156258724) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326074784076956E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 26.7982s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8766s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.9216s for 90112 events => throughput is 4.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.1120s + [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3326074784076956E-007) differ by less than 4E-4 (0.00014111248645076735) +OK! 
xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.541018e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541711e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.381750e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.509825e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.374728e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.108824e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.391979e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092226e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 6.425348e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 0ce4090d7a..99d7cfbcd5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-03_18:33:43 +DATE: 2024-03-01_06:24:34 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.7954s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3694s - [COUNTERS] Fortran MEs ( 1 ) : 54.4260s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628942014972E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 54.5671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3751s - [COUNTERS] Fortran MEs ( 1 ) : 54.1919s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.1318s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783648085419E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 598.9863s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0430s - [COUNTERS] Fortran MEs ( 1 ) : 595.9433s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1057.5728s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s + [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629013416990E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 175.2429s - [COUNTERS] Fortran Overhead ( 0 ) : 80.2048s - [COUNTERS] CudaCpp MEs ( 2 ) : 95.0382s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 220.4361s + [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s + [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629013416990E-006) differ by less than 2E-4 (5.7565425759520394e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985299359844E-006) differ by less than 2E-4 (5.7578810608305275e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783773791503E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1125.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 82.6919s - [COUNTERS] CudaCpp MEs ( 2 ) : 1042.6765s for 90112 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1422.8276s + [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s + [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783773791503E-007) differ by less than 2E-4 (5.389840573855054e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007) differ by less than 2E-4 (5.389404034161771e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.023694e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034350e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629009850969E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 78.5615s - [COUNTERS] Fortran Overhead ( 0 ) : 35.1164s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.4451s for 8192 events => throughput is 1.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.5022s + [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629009850969E-006) differ by less than 2E-4 (5.469044328521022e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985295828471E-006) differ by less than 2E-4 (5.473184350179849e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783784120318E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 515.7699s - [COUNTERS] Fortran Overhead ( 0 ) : 37.8592s - [COUNTERS] CudaCpp MEs ( 2 ) : 477.9107s for 90112 events => throughput is 1.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 715.3882s + [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s + [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783784120318E-007) differ by less than 2E-4 (5.832704319530535e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007) differ by less than 2E-4 (5.830713245558172e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.364562e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.366448e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403629007633195E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 34.2387s - [COUNTERS] Fortran Overhead ( 0 ) : 15.3454s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.8933s for 8192 events => throughput is 4.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.5744s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403628942014972E-006) and cpp (1.2403629007633195E-006) differ by less than 2E-4 (5.290244020628165e-09) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783783946155E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 227.2672s - [COUNTERS] Fortran Overhead ( 0 ) : 17.8656s - [COUNTERS] CudaCpp MEs ( 2 ) : 209.4016s for 90112 events => throughput is 4.30E+02 events/s + [COUNTERS] PROGRAM TOTAL : 319.2663s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783783946155E-007) differ by less than 2E-4 (5.825236737422301e-09) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.492842e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 42.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 277.3470s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.491012e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 45.2143s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985293629285E-006) differ by less than 2E-4 (5.29588750630694e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 278.0679s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s + [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007) differ by less than 2E-4 (5.822204496297445e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403628931370709E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 12.2202s - [COUNTERS] Fortran Overhead ( 0 ) : 8.0547s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1655s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5884s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.2403628942014972E-006) and cpp (1.2403628931370709E-006) differ by less than 2E-4 (8.581571009358413e-10) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322783640044522E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 56.4471s - [COUNTERS] Fortran Overhead ( 0 ) : 10.5120s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.9352s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.9902s + [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322783648085419E-007) and cpp (2.3322783640044522E-007) differ by less than 2E-4 (3.447657714872321e-10) +OK! 
xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.973675e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.002670e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.263024e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394768e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 
11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327371e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.281455e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.327509e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081275e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] 
(3a) = ( 3.656493e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 918e7c2a67..8e9ad5ba7a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering 
directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-03_17:03:36 +DATE: 2024-03-01_03:51:32 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4066s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => 
throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3243s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2762s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7846s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2591s - [COUNTERS] Fortran MEs ( 1 ) : 0.5255s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s + [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4354s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3662s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263335) differ by less than 3E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343820] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0569s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7647s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343820) differ by less than 3E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ by less than 3E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.204533e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.208719e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166122] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2986s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0335s for 8192 events => throughput is 2.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166122) differ by less than 3E-14 (2.9531932455029164e-14) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351262530) differ by less than 3E-14 (2.9531932455029164e-14) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6201s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3670s for 90112 events => throughput is 2.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9658s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ by less than 3E-14 (6.661338147750939e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.474720e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.504610e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3021s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226551166922) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4255s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2356s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 90112 events => throughput is 4.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7585s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679754343823) differ by less than 3E-14 (0.0) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.848976e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.846140e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.65E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263341) differ by less than 3E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8893s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ by less than 3E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index e0426fd000..63166c80e0 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-03_17:03:57 +DATE: 2024-03-01_03:52:02 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3591s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3110s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4536s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => 
throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3133s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2652s - [COUNTERS] Fortran MEs ( 1 ) : 0.0481s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7408s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2155s - [COUNTERS] Fortran MEs ( 1 ) : 0.5254s for 90112 events => throughput is 1.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s + [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110149549279866] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3226s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0563s for 8192 events => throughput is 1.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110149549279866) differ by less than 4E-4 (2.840326210895583e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110463093540638) differ by less than 4E-4 (2.812844174915341e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510678843355344] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9062s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2863s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6199s for 90112 events => throughput is 1.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510678843355344) differ by less than 4E-4 (4.2350520312872675e-08) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ by less than 4E-4 (1.3172298474195543e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.483641e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.487281e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110146988852984] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3063s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2863s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110146988852984) differ by less than 4E-4 (2.934771267448788e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110459152958460) differ by less than 4E-4 (2.9581965829139634e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676993136629] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4589s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2387s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2202s for 90112 events => throughput is 4.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676993136629) differ by less than 4E-4 (1.2836447871311663e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ by less than 4E-4 (1.6458771667782202e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.266778e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.196387e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110148793566186] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2845s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2748s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 8192 events => throughput is 8.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110148793566186) differ by less than 4E-4 (2.8682018052839098e-06) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510676419088856] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.3376s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1067s for 90112 events => throughput is 8.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6208s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510676419088856) differ by less than 4E-4 (1.5505111905511626e-07) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.755084e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.808083e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110460595003461) differ by less than 4E-4 (2.9050052766654844e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.6084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ by less than 4E-4 (1.8848637739488083e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110464176080312) differ by less than 4E-4 (2.772913590631809e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ by less than 4E-4 (5.3231167917999755e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! 
Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8968s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index e46d313ea3..eb4ca92d13 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-03_17:04:17 +DATE: 2024-03-01_03:52:30 -On uan02 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3623s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3142s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => 
throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226551166922] fbridge_mode=0 + [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3135s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2653s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679754343823] fbridge_mode=0 + [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2214s - [COUNTERS] Fortran MEs ( 1 ) : 0.5261s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2499s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s + [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005623] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4039s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3346s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0693s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005623) differ by less than 2E-4 (7.972267290767832e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658835] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.0500s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7593s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658835) differ by less than 2E-4 (2.0059864880295208e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ by less than 2E-4 (1.967879192932287e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.207970e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.211751e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226549005628] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3342s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226549005628) differ by less than 2E-4 (7.972245086307339e-11) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539348916002) differ by less than 2E-4 (8.658362915525686e-11) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679758658832] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6263s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3682s for 90112 events => throughput is 2.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9359s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679758658832) differ by less than 2E-4 (2.0059842675834716e-10) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ by less than 2E-4 (1.9678769724862377e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.493733e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.488023e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110226530029391] fbridge_mode=1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3020s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.27110226551166922) and cpp (0.27110226530029391) differ by less than 2E-4 (7.796884249344771e-10) +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510679756340242] fbridge_mode=1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4279s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1888s for 90112 events => throughput is 4.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21510679754343823) and cpp (0.21510679756340242) differ by less than 2E-4 (9.281064805577444e-11) +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.847689e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.864616e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.3573s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.7304s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539330272815) differ by less than 2E-4 (7.742566587864985e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9542s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ by less than 2E-4 (9.253309229961815e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! 
' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7473s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8944s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index ad41cc6bfb..baa8c044cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 
-HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:01:50 +DATE: 2024-03-01_02:23:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.324596e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.112739e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.342398e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.859560 sec - 1,321,428,084 cycles:u # 1.235 GHz (74.81%) - 2,150,562 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.38%) - 5,995,950 stalled-cycles-backend:u # 0.45% backend cycles idle (74.89%) - 2,074,374,266 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (75.26%) - 1.422954208 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.907657 sec + 2,864,594,511 cycles # 3.017 GHz + 4,419,491,827 instructions # 1.54 insn per cycle + 1.243823060 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.252614e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.432573e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.432573e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.754991 sec - 19,449,019,198 cycles:u # 3.361 GHz (74.99%) - 51,261,991 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.99%) - 55,689,936 stalled-cycles-backend:u # 0.29% backend cycles idle (74.91%) - 47,091,295,354 instructions:u # 2.42 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 5.789720623 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.029383 sec + 18,345,746,310 cycles # 3.041 GHz + 43,971,705,846 instructions # 2.40 insn per cycle + 6.038464488 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.927684e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425335e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.425335e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.992105 sec - 13,278,223,442 cycles:u # 3.301 GHz (74.95%) - 52,332,939 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.96%) - 1,017,469,682 stalled-cycles-backend:u # 7.66% backend cycles idle (74.96%) - 31,194,174,433 instructions:u # 2.35 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 4.028308592 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.154865 sec + 12,823,382,487 cycles # 3.082 GHz + 30,998,172,347 instructions # 2.42 insn per cycle + 4.171623433 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.667453e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.548951e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.548951e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.088586 sec - 10,124,819,179 cycles:u # 3.246 GHz (74.93%) - 47,833,768 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.89%) - 414,834,375 stalled-cycles-backend:u # 4.10% backend cycles idle (74.87%) - 19,410,775,333 instructions:u # 1.92 insn per cycle - # 0.02 stalled cycles per insn (74.97%) - 3.123576071 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.406763 sec + 10,081,289,557 cycles # 2.955 GHz + 19,366,111,959 instructions # 1.92 insn per cycle + 3.427414790 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.257696 sec + 9,685,682,355 cycles # 2.968 GHz + 18,976,171,527 instructions # 1.96 insn per cycle + 3.273948471 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.888242 sec + 8,621,851,062 cycles # 2.214 GHz + 15,727,334,662 instructions # 1.82 insn per cycle + 3.905958468 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 02c3c2eb21..b9ff72dbf3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:51:40 +DATE: 2024-03-01_03:12:58 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.474799e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.308389e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.308389e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.547677 sec - 18,355,875,731 cycles:u # 3.290 GHz (74.87%) - 120,621,827 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.92%) - 6,921,495,641 stalled-cycles-backend:u # 37.71% backend cycles idle (75.07%) - 17,150,236,773 instructions:u # 0.93 insn per cycle - # 0.40 stalled cycles per insn (75.06%) - 5.611745225 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.232505 sec + 7,524,955,995 cycles # 3.041 GHz + 13,468,669,108 instructions # 1.79 insn per cycle + 2.532807464 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.236454e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.411058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.411058e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.925739 sec - 19,889,727,529 cycles:u # 3.334 GHz (74.92%) - 51,462,014 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.92%) - 116,028,144 stalled-cycles-backend:u # 0.58% backend cycles idle (74.99%) - 47,213,004,904 instructions:u # 2.37 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 5.968530389 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.415532 sec + 19,561,606,037 cycles # 3.046 GHz + 44,198,639,919 instructions # 2.26 insn per cycle + 6.422457347 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.876382e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345724e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345724e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.222665 sec - 13,876,988,167 cycles:u # 3.256 GHz (75.04%) - 53,634,197 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.04%) - 1,081,837,065 stalled-cycles-backend:u # 7.80% backend cycles idle (74.88%) - 32,066,862,891 instructions:u # 2.31 insn per cycle - # 0.03 stalled cycles per insn (74.88%) - 4.266640956 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.664054 sec + 13,997,557,946 cycles # 2.998 GHz + 31,841,279,233 instructions # 2.27 insn per cycle + 4.670791737 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.550661e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.348496e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.348496e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.349151 sec - 10,813,502,985 cycles:u # 3.190 GHz (75.01%) - 50,474,326 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.00%) - 406,381,998 stalled-cycles-backend:u # 3.76% backend cycles idle (75.00%) - 20,750,963,840 instructions:u # 1.92 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 3.393666389 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.823801 sec + 11,324,833,068 cycles # 2.957 GHz + 20,724,775,427 instructions # 1.83 insn per cycle + 3.830534322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.704930 sec + 10,963,593,820 cycles # 2.954 GHz + 20,347,072,159 instructions # 1.86 insn per cycle + 3.711957869 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.214412 sec + 9,956,996,891 cycles # 2.360 GHz + 16,873,658,319 instructions # 1.69 insn per cycle + 4.221168968 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 0b5bb72c22..09aaad1dd8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_15:03:14 +DATE: 2024-03-01_03:26:09 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.256929e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.117141e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.346881e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.665450 sec - 15,369,141,348 cycles:u # 3.275 GHz (75.00%) - 53,792,556 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.98%) - 6,901,191,727 stalled-cycles-backend:u # 44.90% backend cycles idle (74.94%) - 11,509,551,541 instructions:u # 0.75 insn per cycle - # 0.60 stalled 
cycles per insn (74.93%) - 4.717205353 seconds time elapsed +TOTAL : 1.329039 sec + 4,626,136,964 cycles # 2.966 GHz + 7,229,705,832 instructions # 1.56 insn per cycle + 1.616136536 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.250207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.429729e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429729e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.768692 sec - 19,540,266,544 cycles:u # 3.370 GHz (74.89%) - 50,423,911 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.96%) - 72,392,327 stalled-cycles-backend:u # 0.37% backend cycles idle (75.03%) - 47,003,028,682 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.801685990 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.368910 sec + 19,436,039,687 cycles # 3.050 GHz + 44,075,637,403 instructions # 2.27 insn per cycle + 6.374367735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.928009e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.423643e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.423643e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.989684 sec - 13,290,935,841 cycles:u # 3.306 GHz (74.93%) - 52,308,037 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.94%) - 1,092,395,998 stalled-cycles-backend:u # 8.22% backend cycles idle (74.94%) - 31,201,564,853 instructions:u # 2.35 insn per cycle - # 0.04 stalled cycles per insn (74.93%) - 4.022478905 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.477126 sec + 13,840,650,655 cycles # 3.088 GHz + 31,000,398,658 instructions # 2.24 insn per cycle + 4.482579907 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.658913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.545624e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.545624e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.096330 sec - 10,149,407,506 cycles:u # 3.246 GHz (74.93%) - 49,215,562 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.82%) - 419,084,250 stalled-cycles-backend:u # 4.13% backend cycles idle (74.82%) - 19,401,968,357 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (74.96%) - 3.129271544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +TOTAL : 3.779571 sec + 11,221,356,305 cycles # 2.967 GHz + 19,268,573,834 instructions # 1.72 insn per cycle + 3.784933241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.643336 sec + 10,818,026,445 cycles # 2.966 GHz + 18,676,470,141 instructions # 1.73 insn per cycle + 3.648853496 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.111357 sec + 9,725,602,646 cycles # 2.364 GHz + 15,429,502,829 instructions # 1.59 insn per cycle + 4.116843302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 
tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 5417009137..4a4acadae4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:59:28 +DATE: 2024-03-01_03:19:38 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.514751e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.084739e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.314117e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.378218 sec - 17,859,605,335 cycles:u # 3.302 GHz (75.01%) - 119,964,353 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.01%) - 6,837,606,477 stalled-cycles-backend:u # 38.29% backend cycles idle (75.06%) - 16,740,556,145 instructions:u # 0.94 insn per cycle - # 0.41 stalled cycles per insn (75.04%) - 5.429728919 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.841184 sec + 6,281,268,865 cycles # 3.032 GHz + 11,616,541,551 instructions # 1.85 insn per cycle + 2.127335919 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.251851e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.431006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.431006e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.756539 sec - 19,485,062,223 cycles:u # 3.368 GHz (74.98%) - 51,536,175 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.98%) - 59,518,097 stalled-cycles-backend:u # 0.31% backend cycles idle (74.99%) - 47,089,103,622 instructions:u # 2.42 insn per cycle - # 0.00 stalled cycles per insn (74.92%) - 5.789154373 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.931254 sec + 18,320,874,631 cycles # 3.087 GHz + 43,971,483,251 instructions # 2.40 insn per cycle + 5.936943481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.934074e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.436656e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.436656e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.982711 sec - 13,292,890,354 cycles:u # 3.312 GHz (74.90%) - 52,677,465 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.88%) - 1,040,209,336 stalled-cycles-backend:u # 7.83% backend cycles idle (74.94%) - 31,137,052,240 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 4.015553191 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.142725 sec + 12,747,370,194 cycles # 3.074 GHz + 30,997,666,885 instructions # 2.43 insn per cycle + 4.148307465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.669189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.552335e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.552335e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.086401 sec - 10,130,156,702 cycles:u # 3.250 GHz (74.87%) - 47,813,678 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.85%) - 431,077,569 stalled-cycles-backend:u # 4.26% backend cycles idle (74.95%) - 19,360,973,007 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (75.08%) - 3.118993026 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.411600 sec + 10,085,079,136 cycles # 2.953 GHz + 19,364,558,625 instructions # 1.92 insn per cycle + 3.417084709 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.338836 sec + 9,731,023,917 cycles # 2.911 GHz + 18,988,816,377 instructions # 1.95 insn per cycle + 3.344328310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.766791 sec + 8,586,243,314 cycles # 2.277 GHz + 15,726,194,960 instructions # 1.83 insn per cycle + 3.772300478 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 79e3941e0f..acaec4a100 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:02:18 +DATE: 2024-03-01_02:24:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.920060e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.601512e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.923626e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.491128 sec - 1,290,175,959 cycles:u # 2.505 GHz (75.02%) - 2,375,328 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.49%) - 5,596,436 stalled-cycles-backend:u # 0.43% backend cycles idle (74.55%) - 2,025,639,080 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (74.55%) - 0.545806838 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699180 sec + 2,815,032,547 cycles # 3.020 GHz + 4,411,732,319 instructions # 1.57 insn per cycle + 1.012826906 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.324266e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.526356e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.526356e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.479692 sec - 18,527,886,497 cycles:u # 3.363 GHz (74.94%) - 51,303,501 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.01%) - 71,667,286 stalled-cycles-backend:u # 0.39% backend cycles idle (75.03%) - 44,741,161,010 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.513054212 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.744811 sec + 17,454,360,700 cycles # 3.039 GHz + 41,822,159,126 instructions # 2.40 insn per cycle + 5.754685240 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.023611e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577643e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.577643e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.835861 sec - 12,754,913,480 cycles:u # 3.299 GHz (74.99%) - 52,420,621 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.97%) - 73,335,748 stalled-cycles-backend:u # 0.57% backend cycles idle (74.97%) - 30,104,394,151 instructions:u # 2.36 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 3.870675775 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.046627 sec + 12,493,235,601 cycles # 3.083 GHz + 30,160,547,265 instructions # 2.41 insn per cycle + 4.067076512 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.602054e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.435734e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.435734e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.149196 sec - 10,399,600,756 cycles:u # 3.270 GHz (74.90%) - 52,047,342 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.84%) - 302,365,095 stalled-cycles-backend:u # 2.91% backend cycles idle (74.92%) - 18,912,332,875 instructions:u # 1.82 insn per cycle - # 0.02 stalled cycles per insn (75.05%) - 3.184201759 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.357760 sec + 9,927,136,910 cycles # 2.952 GHz + 19,096,793,241 instructions # 1.92 insn per cycle + 3.375474470 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.243150 sec + 9,616,213,299 cycles # 2.960 GHz + 18,757,748,925 instructions # 1.95 insn per cycle + 3.265371118 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.680994 sec + 8,464,459,891 cycles # 2.296 GHz + 15,603,182,673 instructions # 1.84 insn per cycle + 3.700542167 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index d45f8d9d60..5e36a6ad1c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:32:31 +DATE: 2024-03-01_03:02:07 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.257790e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.106060e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.334611e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.510314 sec - 1,324,021,627 cycles:u # 2.494 GHz (74.45%) - 2,359,804 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.14%) - 5,360,054 stalled-cycles-backend:u # 0.40% backend cycles idle (74.44%) - 2,075,244,536 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (75.14%) - 0.565639760 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.677531 sec + 2,738,360,567 cycles # 3.010 GHz + 4,202,554,319 instructions # 1.53 insn per cycle + 0.971727419 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.786118e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.174032e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.174032e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.252899 sec - 14,197,263,778 cycles:u # 3.313 GHz (74.99%) - 51,197,569 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) - 401,360,207 stalled-cycles-backend:u # 2.83% backend cycles idle (75.00%) - 36,769,260,973 instructions:u # 2.59 insn per cycle - # 0.01 stalled cycles per insn (75.00%) - 4.288026881 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.107132 sec + 12,669,493,888 cycles # 3.081 GHz + 32,513,570,576 instructions # 2.57 insn per cycle + 4.112837024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.413369e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.248265e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.248265e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.336198 sec - 11,000,487,394 cycles:u # 3.267 GHz (74.87%) - 51,674,634 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.99%) - 251,927,164 stalled-cycles-backend:u # 2.29% backend cycles idle (75.05%) - 24,665,290,150 instructions:u # 2.24 insn per cycle - # 0.01 stalled cycles per insn (75.06%) - 3.371408592 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.385880 sec + 10,259,128,837 cycles # 3.025 GHz + 24,473,597,991 instructions # 2.39 insn per cycle + 3.391687112 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.010543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.186949e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.186949e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.820748 sec - 9,177,598,955 cycles:u # 3.218 GHz (74.96%) - 48,256,657 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.04%) - 140,263,482 stalled-cycles-backend:u # 1.53% backend cycles idle (75.04%) - 16,797,741,901 instructions:u # 1.83 insn per cycle - # 0.01 stalled cycles per insn (75.04%) - 2.856117399 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.179158 sec + 9,139,183,085 cycles # 2.870 GHz + 16,922,980,195 instructions # 1.85 insn per cycle + 3.185130704 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.299126 sec + 9,225,486,663 cycles # 2.804 GHz + 16,350,529,622 instructions # 1.77 insn per cycle + 3.305119215 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.449960 sec + 7,914,148,444 cycles # 2.292 GHz + 14,582,993,732 instructions # 1.84 insn per cycle + 3.455623027 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 2554b2b401..640cde8efe 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:32:55 +DATE: 2024-03-01_03:02:37 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.884470e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.600483e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.922931e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.490936 sec - 1,262,401,840 cycles:u # 2.452 GHz (74.12%) - 2,354,272 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.27%) - 5,127,550 stalled-cycles-backend:u # 0.41% backend cycles idle (75.30%) - 2,046,536,140 instructions:u # 1.62 insn per cycle - # 0.00 stalled cycles per insn (75.85%) - 0.545640180 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.676373 sec + 2,668,503,996 cycles # 2.929 GHz + 4,153,523,497 instructions # 1.56 insn per cycle + 0.971892133 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.448227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.241157e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.241157e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.300684 sec - 10,864,613,767 cycles:u # 3.260 GHz (74.93%) - 50,513,894 stalled-cycles-frontend:u # 0.46% frontend cycles idle (75.04%) - 58,038,331 stalled-cycles-backend:u # 0.53% backend cycles idle (75.04%) - 28,417,485,541 instructions:u # 2.62 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 3.335392064 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.188433 sec + 9,833,021,244 cycles # 3.080 GHz + 25,393,539,961 instructions # 2.58 insn per cycle + 3.194101979 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.640077e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.678024e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.678024e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.110817 sec - 10,212,854,193 cycles:u # 3.251 GHz (74.84%) - 49,334,287 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.91%) - 54,421,651 stalled-cycles-backend:u # 0.53% backend cycles idle (75.04%) - 21,633,351,327 instructions:u # 2.12 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 3.146041531 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.899703 sec + 8,920,893,128 cycles # 3.072 GHz + 21,482,466,118 instructions # 2.41 insn per cycle + 2.905533602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.296083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.760867e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.760867e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.640234 sec - 8,552,163,954 cycles:u # 3.202 GHz (74.92%) - 48,388,466 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.87%) - 139,356,364 stalled-cycles-backend:u # 1.63% backend cycles idle (74.85%) - 15,849,743,897 instructions:u # 1.85 insn per cycle - # 0.01 stalled cycles per insn (74.94%) - 2.675594881 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.882396 sec + 8,595,793,495 cycles # 2.978 GHz + 15,810,706,009 instructions # 1.84 insn per cycle + 2.888136564 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165084E-002 -Relative difference = 1.0277089582483854e-08 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.905551 sec + 8,435,887,633 cycles # 2.898 GHz + 15,503,428,881 instructions # 1.84 insn per cycle + 2.911395780 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.208349 sec + 7,562,205,797 cycles # 2.353 GHz + 14,282,233,625 instructions # 1.89 insn per cycle + 3.214128577 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165088E-002 +Relative difference = 1.0277089312025782e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 192f203417..4388b968c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:02:44 +DATE: 2024-03-01_02:25:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.894573e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.208938e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.968247e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.384630 sec - 968,263,305 cycles:u # 2.395 GHz (74.33%) - 2,353,929 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.20%) - 4,170,768 stalled-cycles-backend:u # 0.43% backend cycles idle (73.94%) - 1,766,876,262 instructions:u # 1.82 insn per cycle - # 0.00 stalled cycles per insn (76.21%) - 0.433113828 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 1.080730e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.584592 sec + 2,424,873,450 cycles # 2.992 GHz + 3,757,113,510 instructions # 1.55 insn per cycle + 0.891497126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.421004e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.651144e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.651144e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.098926 sec - 17,283,392,500 cycles:u # 3.371 GHz (75.02%) - 39,987,964 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.04%) - 29,814,946 stalled-cycles-backend:u # 0.17% backend cycles idle (75.04%) - 47,182,351,433 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.129339329 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.861200 sec + 17,835,681,737 cycles # 3.040 GHz + 43,512,863,183 instructions # 2.44 insn per cycle + 5.870178360 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.949337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.195301e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.195301e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.811402 sec - 9,236,170,641 cycles:u # 3.254 GHz (74.93%) - 40,377,405 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.92%) - 911,350,097 stalled-cycles-backend:u # 9.87% backend cycles idle (74.94%) - 22,187,807,240 instructions:u # 2.40 insn per cycle - # 0.04 stalled cycles per insn (74.94%) - 2.842564880 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.010180 sec + 9,264,818,102 cycles # 3.072 GHz + 21,907,230,972 instructions # 2.36 insn per cycle + 3.030108679 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.416658e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.002062e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.002062e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.515569 sec - 8,226,874,549 cycles:u # 3.236 GHz (74.74%) - 42,467,746 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.71%) - 1,634,949,869 stalled-cycles-backend:u # 19.87% backend cycles idle (75.00%) - 15,532,344,605 instructions:u # 1.89 insn per cycle - # 0.11 stalled cycles per insn (75.14%) - 2.546576367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.786671 sec + 8,293,439,755 cycles # 2.970 GHz + 15,591,050,714 instructions # 1.88 insn per cycle + 2.803351674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.857922 sec + 8,240,284,445 cycles # 2.878 GHz + 15,434,807,288 instructions # 1.87 insn per cycle + 2.873134335 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.738177 sec + 6,634,758,903 cycles # 2.418 GHz + 12,863,535,626 instructions # 1.94 insn per cycle + 2.752418443 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 13d81de7e2..5ebf98d844 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:52:12 +DATE: 2024-03-01_03:13:35 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.603281e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306866e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.306866e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.394868 sec - 17,747,723,699 cycles:u # 3.296 GHz (75.06%) - 116,293,455 stalled-cycles-frontend:u # 0.66% frontend cycles idle (75.05%) - 6,964,156,661 stalled-cycles-backend:u # 39.24% backend cycles idle (75.08%) - 17,085,845,774 instructions:u # 0.96 insn per cycle - # 0.41 stalled cycles per insn (75.02%) - 5.448649910 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.664885 sec + 5,743,008,286 cycles # 3.032 GHz + 10,353,112,228 instructions # 1.80 insn per cycle + 1.950710268 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.407290e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.631935e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.631935e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.209030 sec - 17,512,513,676 cycles:u # 3.342 GHz (74.97%) - 39,347,428 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.98%) - 73,710,188 stalled-cycles-backend:u # 0.42% backend cycles idle (74.98%) - 47,446,515,991 instructions:u # 2.71 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 5.243117391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.094512 sec + 18,492,834,117 cycles # 3.035 GHz + 43,665,828,462 instructions # 2.36 insn per cycle + 6.100764200 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.843798e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.993414e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.993414e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.973264 sec - 9,687,625,736 cycles:u # 3.224 GHz (74.98%) - 41,442,139 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.98%) - 958,118,871 stalled-cycles-backend:u # 9.89% backend cycles idle (74.87%) - 23,576,025,926 instructions:u # 2.43 insn per cycle - # 0.04 stalled cycles per insn (74.87%) - 3.009926908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.242674 sec + 9,984,073,322 cycles # 3.074 GHz + 23,241,211,318 instructions # 2.33 insn per cycle + 3.248988906 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.317531e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795026e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795026e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.649491 sec - 8,549,300,727 cycles:u # 3.188 GHz (74.94%) - 42,286,661 stalled-cycles-frontend:u # 0.49% frontend cycles idle (74.96%) - 1,634,655,298 stalled-cycles-backend:u # 19.12% backend cycles idle (74.96%) - 16,685,077,328 instructions:u # 1.95 insn per cycle - # 0.10 stalled cycles per insn (74.94%) - 2.685772670 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.031931 sec + 9,018,287,343 cycles # 2.969 GHz + 16,710,480,351 instructions # 1.85 insn per cycle + 3.038355322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.003313 sec + 8,924,279,581 cycles # 2.966 GHz + 16,553,851,203 instructions # 1.85 insn per cycle + 3.009721457 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.047824 sec + 7,411,564,908 cycles # 2.428 GHz + 14,070,800,087 instructions # 1.90 insn per cycle + 3.054259465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 6f27605efb..57f3a9eb6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_15:03:45 +DATE: 2024-03-01_03:26:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.826583e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.171348e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.927510e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.542977 sec - 15,015,738,206 cycles:u # 3.288 GHz (75.04%) - 54,154,072 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.83%) - 6,892,515,673 stalled-cycles-backend:u # 45.90% backend cycles idle (74.86%) - 11,015,292,588 instructions:u # 0.73 insn per cycle - # 0.63 stalled cycles per insn (75.12%) - 4.592827207 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) 
sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.176348 sec + 4,160,459,328 cycles # 2.977 GHz + 6,608,736,714 instructions # 1.59 insn per cycle + 1.454481545 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.412776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637786e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637786e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.125912 sec - 17,371,936,407 cycles:u # 3.372 GHz (74.98%) - 39,561,209 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) - 71,336,012 stalled-cycles-backend:u # 0.41% backend cycles idle (75.00%) - 47,201,038,679 instructions:u # 2.72 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 5.154937938 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.084905 sec + 18,848,150,042 cycles # 3.095 GHz + 43,694,410,467 instructions # 2.32 insn per cycle + 6.090122961 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.929734e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.146974e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.146974e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.822597 sec - 9,288,450,283 cycles:u # 3.260 GHz (74.99%) - 40,549,445 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.01%) - 929,075,556 stalled-cycles-backend:u # 10.00% backend cycles idle (75.01%) - 22,142,860,809 instructions:u # 2.38 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 2.851562031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.340145 sec + 10,237,006,523 cycles # 3.061 GHz + 21,987,992,116 instructions # 2.15 insn per cycle + 3.345494687 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.414461e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.996487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.996487e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.516320 sec - 8,209,636,772 cycles:u # 3.229 GHz (74.86%) - 42,016,189 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.86%) - 1,641,180,581 stalled-cycles-backend:u # 19.99% backend cycles idle (74.92%) - 15,546,166,497 instructions:u # 1.89 insn per cycle - # 0.11 stalled cycles per insn (75.08%) - 2.545171918 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +TOTAL : 3.130033 sec + 9,276,164,079 cycles # 2.959 GHz + 15,501,530,354 instructions # 1.67 insn per cycle + 3.135291294 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.090209 sec + 9,218,829,691 cycles # 2.980 GHz + 15,143,949,757 instructions # 1.64 insn per cycle + 3.095551418 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.081111 sec + 7,633,670,846 cycles # 2.474 GHz + 12,572,894,419 instructions # 1.65 insn per cycle + 3.086406325 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 75ba62d6e0..8d8716bc9a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:59:59 +DATE: 2024-03-01_03:20:14 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.352458e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.000530e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.714853e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.254995 sec - 17,501,585,656 cycles:u # 3.313 GHz (75.02%) - 117,376,845 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.03%) - 6,895,746,974 stalled-cycles-backend:u # 39.40% backend cycles idle (75.03%) - 16,722,875,951 instructions:u # 0.96 insn per cycle - # 0.41 stalled cycles per insn (75.03%) - 5.301503353 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.452761 sec + 5,067,036,613 cycles # 3.030 GHz + 9,262,361,364 instructions # 1.83 insn per cycle + 1.731002061 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.422792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.651217e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.651217e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.093515 sec - 17,258,116,112 cycles:u # 3.371 GHz (75.00%) - 39,040,967 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.00%) - 35,911,867 stalled-cycles-backend:u # 0.21% backend cycles idle (75.00%) - 47,203,584,739 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 5.122349856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.780149 sec + 17,815,433,670 cycles # 3.080 GHz + 43,511,102,764 instructions # 2.44 insn per cycle + 5.785180938 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.953939e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.195964e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.195964e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.805057 sec - 9,229,935,831 cycles:u # 3.260 GHz (74.87%) - 41,364,885 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.88%) - 932,793,246 stalled-cycles-backend:u # 10.11% backend cycles idle (74.90%) - 22,162,738,336 instructions:u # 2.40 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 2.833936052 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.992624 sec + 9,227,327,267 cycles # 3.079 GHz + 21,906,426,544 instructions # 2.37 insn per cycle + 2.997895192 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.413730e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.999080e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.999080e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.515895 sec - 8,211,824,655 cycles:u # 3.230 GHz (74.85%) - 42,295,685 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.85%) - 1,609,403,252 stalled-cycles-backend:u # 19.60% backend cycles idle (74.90%) - 15,547,822,400 instructions:u # 1.89 insn per cycle - # 0.10 stalled cycles per insn (75.07%) - 2.544692512 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.845512 sec + 8,254,984,848 cycles # 2.896 GHz + 15,590,498,904 instructions # 1.89 insn per cycle + 2.850900280 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.764714 sec + 8,215,374,590 cycles # 2.969 GHz + 15,429,066,515 instructions # 1.88 insn per cycle + 2.770036927 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.731162 sec + 6,615,238,340 cycles # 2.419 GHz + 12,862,797,254 instructions # 1.94 insn per cycle + 2.736410000 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052585973637E-002 +Relative difference = 2.0158743040564767e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 7fb4b0ecf3..f9e4000e6d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:03:08 +DATE: 2024-03-01_02:25:31 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.881152e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.242181e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.006805e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.384701 sec - 975,692,681 cycles:u # 2.413 GHz (74.26%) - 2,357,292 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.34%) - 4,115,956 stalled-cycles-backend:u # 0.42% backend cycles idle (74.06%) - 1,796,776,398 instructions:u # 1.84 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.433515678 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 1.095054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.581297 sec + 2,416,875,461 cycles # 3.000 GHz + 3,802,904,431 instructions # 1.57 insn per cycle + 0.886522859 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.546587e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.820453e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.820453e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.740705 sec - 16,005,689,600 cycles:u # 3.357 GHz (75.00%) - 39,140,113 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.00%) - 24,772,897 stalled-cycles-backend:u # 0.15% backend cycles idle (75.00%) - 44,042,185,394 instructions:u # 2.75 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.770282414 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.444566 sec + 16,726,225,777 cycles # 3.070 GHz + 41,270,625,621 instructions # 2.47 insn per cycle + 5.454849598 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039854866802E-002 +Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.018225e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.336340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.336340e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.759305 sec - 9,080,357,138 cycles:u # 3.259 GHz (74.94%) - 42,050,430 stalled-cycles-frontend:u # 0.46% frontend cycles idle (75.02%) - 544,746,557 stalled-cycles-backend:u # 6.00% backend cycles idle (75.02%) - 21,614,717,358 instructions:u # 2.38 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 2.790124475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.914617 sec + 8,996,783,237 cycles # 3.081 GHz + 21,210,998,059 instructions # 2.36 insn per cycle + 2.929493898 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.470116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.115335e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.115335e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.485521 sec - 8,111,476,454 cycles:u # 3.229 GHz (74.89%) - 42,631,186 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.87%) - 1,607,734,533 stalled-cycles-backend:u # 19.82% backend cycles idle (74.87%) - 15,371,769,451 instructions:u # 1.90 insn per cycle - # 0.10 stalled cycles per insn (74.95%) - 2.516352054 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.760181 sec + 8,249,336,928 cycles # 2.983 GHz + 15,425,238,678 instructions # 1.87 insn per cycle + 2.778856529 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.789811 sec + 8,096,556,575 cycles # 2.897 GHz + 15,238,891,903 instructions # 1.88 insn per cycle + 2.804859872 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.735992 sec + 6,623,617,660 cycles # 2.417 GHz + 12,843,079,376 instructions # 1.94 insn per cycle + 2.752411310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052564145764E-002 +Relative difference = 1.9988585667912256e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index e01df148fd..fde060de72 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:33:18 +DATE: 2024-03-01_03:03:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.886573e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.209532e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.966862e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.389381 sec - 979,434,085 cycles:u # 2.383 GHz (75.04%) - 2,277,306 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.85%) - 4,390,330 stalled-cycles-backend:u # 0.45% backend cycles idle (73.83%) - 1,816,731,527 instructions:u # 1.85 insn per cycle - # 0.00 stalled cycles per insn (74.28%) - 0.437890631 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 1.181869e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576138 sec + 2,415,755,755 cycles # 3.001 GHz + 3,734,378,655 instructions # 1.55 insn per cycle + 0.864225849 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.376620e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.376620e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.933142 sec - 13,173,596,053 cycles:u # 3.327 GHz (74.95%) - 38,281,114 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.95%) - 921,861,865 stalled-cycles-backend:u # 7.00% backend cycles idle (74.96%) - 38,070,756,048 instructions:u # 2.89 insn per cycle - # 0.02 stalled cycles per insn (74.97%) - 3.962633268 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.002640 sec + 12,159,409,273 cycles # 3.035 GHz + 32,432,694,101 instructions # 2.67 insn per cycle + 4.008158303 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039543819614E-002 -Relative difference = 3.5561191488957804e-08 +Avg ME (F77/C++) = 1.2828039840314887E-002 +Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.477958e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.358625e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.358625e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.484051 sec - 8,104,443,603 cycles:u # 3.227 GHz (74.86%) - 41,987,704 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.86%) - 415,967,627 stalled-cycles-backend:u # 5.13% backend cycles idle (74.95%) - 18,682,517,136 instructions:u # 2.31 insn per cycle - # 0.02 stalled cycles per insn (75.12%) - 2.515458731 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.601867 sec + 7,999,882,010 cycles # 3.069 GHz + 18,656,600,340 instructions # 2.33 insn per cycle + 2.607493343 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039283704129E-002 +Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.856113e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.000169e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.000169e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.306514 sec - 7,472,562,788 cycles:u # 3.202 GHz (75.03%) - 43,082,857 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.98%) - 1,358,463,269 stalled-cycles-backend:u # 18.18% backend cycles idle (74.98%) - 14,266,218,058 instructions:u # 1.91 insn per cycle - # 0.10 stalled cycles per insn (74.98%) - 2.337985808 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.492780 sec + 7,427,313,914 cycles # 2.974 GHz + 14,251,086,474 instructions # 1.92 insn per cycle + 2.498394316 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.444620 sec + 7,299,238,549 cycles # 2.980 GHz + 13,947,633,533 instructions # 1.91 insn per cycle + 2.450212772 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053337216261E-002 -Relative difference = 2.601499261602198e-07 +Avg ME (F77/C++) = 1.2828053244447801E-002 +Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.681955 sec + 6,492,318,128 cycles # 2.417 GHz + 13,422,094,611 instructions # 2.07 insn per cycle + 2.687432186 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052562326775E-002 +Relative difference = 1.997440588685788e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 685a4c5586..0d6d3b3db1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:33:40 +DATE: 2024-03-01_03:03:32 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=1, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.883051e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.242645e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.010106e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.372027e-02 +- 3.270769e-06 ) GeV^0 -TOTAL : 0.386398 sec - 972,456,357 cycles:u # 2.401 GHz (75.26%) - 2,356,298 stalled-cycles-frontend:u # 0.24% frontend cycles idle (73.40%) - 4,335,483 stalled-cycles-backend:u # 0.45% backend cycles idle (73.42%) - 1,793,751,584 instructions:u # 1.84 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 0.437765238 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) 
= ( 1.204111e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576922 sec + 2,404,705,116 cycles # 2.985 GHz + 3,758,296,111 instructions # 1.56 insn per cycle + 0.864210592 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.686654e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.635896e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.635896e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.015670 sec - 9,959,556,040 cycles:u # 3.273 GHz (74.96%) - 38,732,011 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.03%) - 31,573,728 stalled-cycles-backend:u # 0.32% backend cycles idle (75.03%) - 28,562,544,127 instructions:u # 2.87 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.046442990 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.097656 sec + 9,472,450,742 cycles # 3.053 GHz + 25,268,175,697 instructions # 2.67 insn per cycle + 3.103042436 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039838495897E-002 +Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.841583e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.313065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.313065e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.316521 sec - 7,503,861,433 cycles:u # 3.202 GHz (74.81%) - 38,236,403 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.87%) - 30,212,837 stalled-cycles-backend:u # 0.40% backend cycles idle (75.04%) - 16,951,104,854 instructions:u # 2.26 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 2.347740758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.415041 sec + 7,164,638,851 cycles # 2.961 GHz + 16,869,197,703 instructions # 2.35 insn per cycle + 2.420723497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.065196e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.526485e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.526485e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.223458 sec - 7,197,899,014 cycles:u # 3.198 GHz (74.79%) - 42,930,070 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.92%) - 349,598,703 stalled-cycles-backend:u # 4.86% backend cycles idle (75.09%) - 13,636,238,011 instructions:u # 1.89 insn per cycle - # 0.03 stalled cycles per insn (75.13%) - 2.254983022 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.394138 sec + 7,165,321,711 cycles # 2.987 GHz + 13,616,190,038 instructions # 1.90 insn per cycle + 2.399577311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.363661 sec + 7,031,964,685 cycles # 2.970 GHz + 13,425,613,371 instructions # 1.91 insn per cycle + 2.369281481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053331759293E-002 -Relative difference = 2.597245327285885e-07 +Avg ME (F77/C++) = 1.2828053220800939E-002 +Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.592425 sec + 6,321,858,831 cycles # 2.434 GHz + 13,153,560,775 instructions # 2.08 insn per cycle + 2.597985755 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052536860923E-002 +Relative difference = 1.977588895209662e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index a5f18d3b23..4be3e76490 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:03:32 +DATE: 2024-03-01_02:26:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.318987e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.111965e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.340348e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.504885 sec - 1,317,748,469 cycles:u # 2.492 GHz (74.25%) - 2,348,029 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.33%) - 4,933,092 stalled-cycles-backend:u # 0.37% backend cycles idle (75.11%) - 2,054,494,016 instructions:u # 1.56 insn per cycle - # 0.00 stalled cycles per insn (75.78%) - 0.560665634 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.717219 sec + 2,841,227,385 cycles # 2.957 GHz + 4,430,504,412 instructions # 1.56 insn per cycle + 1.049815549 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590281E-002 -Relative difference = 7.67145406542181e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.233768e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.407841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.407841e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.830420 sec - 19,739,395,369 cycles:u # 3.367 GHz (75.02%) - 51,696,496 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.03%) - 75,656,656 stalled-cycles-backend:u # 0.38% backend cycles idle (75.03%) - 46,919,398,962 instructions:u # 2.38 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.864610691 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.069129 sec + 18,728,354,553 cycles # 3.083 GHz + 44,224,513,518 instructions # 2.36 insn per cycle + 6.079869673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.005878e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.551825e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.551825e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.863822 sec - 12,851,798,844 cycles:u # 3.300 GHz (74.97%) - 51,932,860 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.94%) - 1,589,021,560 stalled-cycles-backend:u # 12.36% backend cycles idle (74.96%) - 30,953,526,366 instructions:u # 2.41 insn per cycle - # 0.05 stalled cycles per insn (74.96%) - 3.898615324 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.001256 sec + 12,323,242,096 cycles # 3.075 GHz + 30,917,838,115 instructions # 2.51 insn per cycle + 4.017904894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.599539e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.429045e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.429045e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.150319 sec - 10,380,428,648 cycles:u # 3.263 GHz (74.89%) - 49,648,911 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) - 872,379,532 stalled-cycles-backend:u # 8.40% backend cycles idle (75.11%) - 19,304,358,873 instructions:u # 1.86 insn per cycle - # 0.05 stalled cycles per insn (75.11%) - 3.185393340 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.416443 sec + 10,120,877,504 cycles # 2.958 GHz + 19,374,733,180 instructions # 1.91 insn per cycle + 3.431641491 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.374976 sec + 9,706,052,635 cycles # 2.871 GHz + 18,944,519,271 instructions # 1.95 insn per cycle + 3.395274500 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.760847 sec + 8,409,257,244 cycles # 2.233 GHz + 15,057,436,319 instructions # 1.79 insn per cycle + 3.776930410 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 15421ace75..77001f8935 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-03_14:03:59 +DATE: 2024-03-01_02:26:35 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.936397e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.601934e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.922350e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371632e-02 +- 3.269165e-06 ) GeV^0 -TOTAL : 0.490032 sec - 1,245,563,795 cycles:u # 2.426 GHz (75.17%) - 2,192,930 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.24%) - 5,709,947 stalled-cycles-backend:u # 0.46% backend cycles idle (75.23%) - 2,036,787,970 instructions:u # 1.64 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 0.545663761 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 
+EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699538 sec + 2,805,342,043 cycles # 2.999 GHz + 4,414,010,673 instructions # 1.57 insn per cycle + 1.020206687 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590284E-002 -Relative difference = 7.67145379496374e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.313034e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511779e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511779e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.519982 sec - 18,669,891,707 cycles:u # 3.364 GHz (74.93%) - 51,510,445 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.92%) - 64,158,399 stalled-cycles-backend:u # 0.34% backend cycles idle (74.96%) - 44,665,149,420 instructions:u # 2.39 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.553260637 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.837265 sec + 18,090,198,997 cycles # 3.097 GHz + 42,472,863,850 instructions # 2.35 insn per cycle + 5.848007644 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.037716e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.603868e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.603868e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.815384 sec - 12,695,824,770 cycles:u # 3.301 GHz (74.88%) - 51,368,917 stalled-cycles-frontend:u # 0.40% frontend cycles idle (74.95%) - 426,406,512 stalled-cycles-backend:u # 3.36% backend cycles idle (75.04%) - 30,185,380,945 instructions:u # 2.38 insn per cycle - # 0.01 stalled cycles per insn (75.04%) - 3.850199439 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.920672 sec + 12,137,736,337 cycles # 3.092 GHz + 30,225,042,392 instructions # 2.49 insn per cycle + 3.938311189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.623894e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.473036e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.473036e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.130926 sec - 10,287,378,182 cycles:u # 3.253 GHz (75.00%) - 46,386,917 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.96%) - 302,753,328 stalled-cycles-backend:u # 2.94% backend cycles idle (74.85%) - 18,963,256,356 instructions:u # 1.84 insn per cycle - # 0.02 stalled cycles per insn (74.85%) - 3.166792585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.437770 sec + 10,015,371,277 cycles # 2.909 GHz + 19,256,811,213 instructions # 1.92 insn per cycle + 3.454377757 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.235635 sec + 9,645,810,411 cycles # 2.976 GHz + 18,756,051,671 instructions # 1.94 insn per cycle + 3.251774736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.592139 sec + 8,293,535,644 cycles # 2.305 GHz + 14,979,176,568 instructions # 1.81 insn per cycle + 3.613399615 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 241e4837ec..9a5df19d5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:04:25 +DATE: 2024-03-01_02:27:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.792674e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.953494e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.007777e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.394758 sec - 923,327,966 cycles:u # 2.275 GHz (74.36%) - 2,293,308 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.05%) - 5,604,796 stalled-cycles-backend:u # 0.61% backend cycles idle (75.03%) - 1,453,147,578 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (74.42%) - 0.449959088 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.535145 sec + 2,303,454,226 cycles # 2.990 GHz + 3,249,200,622 instructions # 1.41 insn per cycle + 0.848848936 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.523861e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.590498e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.590498e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.334949 sec - 14,979,793,943 cycles:u # 3.431 GHz (74.91%) - 9,456,313 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) - 1,486,955,679 stalled-cycles-backend:u # 9.93% backend cycles idle (74.97%) - 38,693,956,907 instructions:u # 2.58 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 4.369306063 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.903669 sec + 15,175,795,116 cycles # 3.093 GHz + 38,374,949,840 instructions # 2.53 insn per cycle + 4.917105673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.498105e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.721418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.721418e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.510166 sec - 8,578,472,900 cycles:u # 3.375 GHz (74.82%) - 9,889,349 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.93%) - 492,113,192 stalled-cycles-backend:u # 5.74% backend cycles idle (75.09%) - 24,326,612,305 instructions:u # 2.84 insn per cycle - # 0.02 stalled cycles per insn (75.14%) - 2.545870437 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.968890 sec + 9,101,848,873 cycles # 3.060 GHz + 24,578,505,710 instructions # 2.70 insn per cycle + 2.986159008 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.681670e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.274519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.274519e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.544546 sec - 5,171,840,561 cycles:u # 3.281 GHz (74.92%) - 8,432,588 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.13%) - 975,939,486 stalled-cycles-backend:u # 18.87% backend cycles idle (75.14%) - 11,476,671,248 instructions:u # 2.22 insn per cycle - # 0.09 stalled cycles per insn (75.14%) - 1.581412468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.936093 sec + 5,474,671,571 cycles # 2.819 GHz + 11,252,385,098 instructions # 2.06 insn per cycle + 1.954008279 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.774092 sec + 4,972,729,611 cycles # 2.794 GHz + 10,557,445,760 instructions # 2.12 insn per cycle + 1.789622209 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.799185 sec + 5,395,066,029 cycles # 1.924 GHz + 7,793,871,634 instructions # 1.44 insn per cycle + 2.817161041 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 7dbee0f112..598396a8e7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:52:41 +DATE: 2024-03-01_03:14:07 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.962044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.790802e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.790802e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.237384 sec - 3,741,729,435 cycles:u # 2.943 GHz (75.04%) - 21,439,104 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.13%) - 1,158,531,216 stalled-cycles-backend:u # 30.96% backend cycles idle (74.85%) - 3,954,224,434 instructions:u # 1.06 insn per cycle - # 0.29 stalled cycles per insn (74.89%) - 1.301539646 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.801549 sec + 3,157,604,220 cycles # 3.025 GHz + 4,827,294,021 instructions # 1.53 insn per cycle + 1.101037847 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.521503e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586603e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.586603e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.416052 sec - 15,061,778,809 cycles:u # 3.380 GHz (74.87%) - 10,460,452 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%) - 1,458,789,729 stalled-cycles-backend:u # 9.69% backend cycles idle (75.05%) - 38,688,044,179 instructions:u # 2.57 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 4.459681433 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.008942 sec + 15,497,351,856 cycles # 3.090 GHz + 38,433,512,801 instructions # 2.48 insn per cycle + 5.015755142 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.465557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.686838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.686838e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.613030 sec - 8,709,883,450 cycles:u # 3.282 GHz (75.01%) - 10,315,750 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.01%) - 509,887,377 stalled-cycles-backend:u # 5.85% backend cycles idle (74.99%) - 24,603,742,616 instructions:u # 2.82 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 2.658578687 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090616 sec + 9,430,020,802 cycles # 3.049 GHz + 24,763,068,407 instructions # 2.63 insn per cycle + 3.097621879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.590705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.168333e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.168333e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.657586 sec - 5,373,669,337 cycles:u # 3.157 GHz (74.71%) - 9,036,023 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.85%) - 958,756,649 stalled-cycles-backend:u # 17.84% backend cycles idle (75.09%) - 11,775,474,143 instructions:u # 2.19 insn per cycle - # 0.08 stalled cycles per insn (75.13%) - 1.706319579 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.984017 sec + 5,826,620,771 cycles # 2.928 GHz + 11,538,062,844 instructions # 1.98 insn per cycle + 1.990946794 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.799262 sec + 5,294,562,816 cycles # 2.933 GHz + 10,843,404,980 instructions # 2.05 insn per cycle + 1.806082483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.778138 sec + 5,743,518,580 cycles # 2.063 GHz + 8,037,207,687 instructions # 1.40 insn per cycle + 2.785184310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 265c701b52..977053e874 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_15:04:13 +DATE: 2024-03-01_03:27:17 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.699007e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971161e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.025778e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.067748 sec - 3,232,507,792 cycles:u # 2.948 GHz (74.39%) - 10,800,000 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.47%) - 1,140,820,368 stalled-cycles-backend:u # 35.29% backend cycles idle (74.75%) - 2,947,382,727 instructions:u # 0.91 insn per cycle - # 0.39 stalled cycles per insn (75.20%) - 1.121100120 
seconds time elapsed +TOTAL : 0.617245 sec + 2,532,813,012 cycles # 2.999 GHz + 3,701,870,616 instructions # 1.46 insn per cycle + 0.904006340 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.525527e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.590693e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.590693e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.331199 sec - 14,981,765,080 cycles:u # 3.434 GHz (74.88%) - 9,200,653 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) - 1,520,129,866 stalled-cycles-backend:u # 10.15% backend cycles idle (75.00%) - 38,690,269,362 instructions:u # 2.58 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 4.364994952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.966854 sec + 15,343,121,883 cycles # 3.087 GHz + 38,390,661,623 instructions # 2.50 insn per cycle + 4.972403311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.432769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.649733e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.649733e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.544461 sec - 8,704,350,159 cycles:u # 3.379 GHz (74.87%) - 10,338,759 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.85%) - 534,931,295 stalled-cycles-backend:u # 6.15% backend cycles idle (74.95%) - 24,372,274,267 instructions:u # 2.80 insn per cycle - # 0.02 stalled cycles per insn (75.11%) - 2.578255119 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.079495 sec + 9,279,730,828 cycles # 3.010 GHz + 24,577,932,954 instructions # 2.65 insn per cycle + 3.085060857 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.677967e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.270492e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.270492e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.545617 sec - 5,214,954,602 cycles:u # 3.307 GHz (74.64%) - 9,315,822 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.81%) - 972,093,360 stalled-cycles-backend:u # 18.64% backend cycles idle (75.06%) - 11,473,357,235 instructions:u # 2.20 insn per cycle - # 0.08 stalled cycles per insn (75.15%) - 1.579355171 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +TOTAL : 1.937503 sec + 5,654,473,993 cycles # 2.911 GHz + 11,233,989,199 instructions # 1.99 insn per cycle + 1.943141738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.757396 sec + 5,128,637,723 cycles # 2.910 GHz + 10,505,547,256 instructions # 2.05 insn per cycle + 1.762900213 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.739915 sec + 5,558,468,681 cycles # 2.025 GHz + 7,741,606,815 instructions # 1.39 insn per cycle + 2.745378653 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5dc219326..e5cfc13b3e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_15:00:28 +DATE: 2024-03-01_03:20:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.817032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967328e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.022046e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.183136 sec - 3,634,374,116 cycles:u # 2.990 GHz (75.05%) - 21,461,083 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.05%) - 1,143,444,081 stalled-cycles-backend:u # 31.46% backend cycles idle (75.00%) - 3,851,624,402 instructions:u # 1.06 insn per cycle - # 0.30 stalled cycles per insn (74.95%) - 1.233526426 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.697938 sec + 2,798,675,219 cycles # 3.021 GHz + 4,376,672,842 instructions # 1.56 insn per cycle + 0.983897382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.525556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.591135e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.591135e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.330569 sec - 14,984,543,242 cycles:u # 3.435 GHz (74.88%) - 9,289,687 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.91%) - 1,446,915,645 stalled-cycles-backend:u # 9.66% backend cycles idle (75.00%) - 38,680,440,371 instructions:u # 2.58 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 4.364535873 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.893907 sec + 15,162,024,600 cycles # 3.096 GHz + 38,372,989,497 instructions # 2.53 insn per cycle + 4.899450957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515645 +Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.489599e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.715001e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715001e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.514792 sec - 8,607,437,165 cycles:u # 3.381 GHz (74.72%) - 9,913,991 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.87%) - 505,052,878 stalled-cycles-backend:u # 5.87% backend cycles idle (75.02%) - 24,369,233,220 instructions:u # 2.83 insn per cycle - # 0.02 stalled cycles per insn (75.17%) - 2.548468659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.935182 sec + 9,091,941,153 cycles # 3.094 GHz + 24,577,519,112 instructions # 2.70 insn per cycle + 2.940777194 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.678183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.266961e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.266961e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.545730 sec - 5,218,044,328 cycles:u # 3.309 GHz (74.65%) - 9,304,576 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.78%) - 959,781,315 stalled-cycles-backend:u # 18.39% backend cycles idle (75.03%) - 11,480,727,731 instructions:u # 2.20 insn per cycle - # 0.08 stalled cycles per insn (75.15%) - 1.579407077 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.869802 sec + 5,458,289,042 cycles # 2.911 GHz + 11,250,961,339 instructions # 2.06 insn per cycle + 1.875881825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.719311 sec + 5,034,836,824 cycles # 2.920 GHz + 10,558,271,294 instructions # 2.10 insn per cycle + 1.725057980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.716839 sec + 5,403,556,568 cycles # 1.987 GHz + 7,794,191,095 instructions # 1.44 insn per cycle + 2.722528243 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index a9d6608bd7..73356b00dd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:04:47 +DATE: 2024-03-01_02:27:35 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.841177e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.927044e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.980188e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.377611 sec - 871,922,050 cycles:u # 2.173 GHz (74.03%) - 2,265,714 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.12%) - 5,115,843 stalled-cycles-backend:u # 0.59% backend cycles idle (74.98%) - 1,365,035,303 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (75.93%) - 0.429796399 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.538743 sec + 2,297,794,086 cycles # 2.963 GHz + 3,276,125,304 instructions # 1.43 insn per cycle + 0.856267333 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.447310e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.508276e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.508276e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.463871 sec - 15,425,766,743 cycles:u # 3.431 GHz (74.93%) - 9,099,706 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) - 26,594,098 stalled-cycles-backend:u # 0.17% backend cycles idle (74.94%) - 39,551,830,467 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 4.499242437 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.877526 sec + 15,081,677,651 cycles # 3.089 GHz + 40,100,660,385 instructions # 2.66 insn per cycle + 4.889980594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.398083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.614446e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.614446e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.562355 sec - 8,749,116,043 cycles:u # 3.373 GHz (74.99%) - 10,470,436 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.02%) - 1,497,401,215 stalled-cycles-backend:u # 17.11% backend cycles idle (75.02%) - 23,520,129,083 instructions:u # 2.69 insn per cycle - # 0.06 stalled cycles per insn (75.02%) - 2.598149109 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.787478 sec + 8,606,981,244 cycles # 3.082 GHz + 23,670,854,000 instructions # 2.75 insn per cycle + 2.801213189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.931928e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.410277e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.410277e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.693161 sec - 5,687,334,776 cycles:u # 3.297 GHz (74.96%) - 9,853,745 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.96%) - 752,383,950 stalled-cycles-backend:u # 13.23% backend cycles idle (75.00%) - 13,198,663,716 instructions:u # 2.32 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 1.729123364 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.088271 sec + 6,101,163,180 cycles # 2.915 GHz + 13,060,965,379 instructions # 2.14 insn per cycle + 2.110411764 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.007458 sec + 5,795,313,103 cycles # 2.878 GHz + 12,320,114,352 instructions # 2.13 insn per cycle + 2.035740422 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.054998 sec + 5,836,990,709 cycles # 1.908 GHz + 9,601,704,067 instructions # 1.64 insn per cycle + 3.069883688 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index bf0bdd420a..7ca7ca6f27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:34:01 +DATE: 2024-03-01_03:03:58 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.824404e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962298e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.016640e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.378356 sec - 915,619,025 cycles:u # 2.271 GHz (75.19%) - 2,407,774 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.60%) - 5,389,487 stalled-cycles-backend:u # 0.59% backend cycles idle (73.96%) - 1,476,869,829 instructions:u # 1.61 insn per cycle - # 0.00 stalled cycles per insn (74.26%) - 0.433727402 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.520509 sec + 2,251,864,611 cycles # 2.979 GHz + 3,200,076,053 instructions # 1.42 insn per cycle + 0.813049887 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.883473e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.970155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.970155e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.817212 sec - 13,160,873,635 cycles:u # 3.419 GHz (74.86%) - 9,243,267 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.97%) - 387,509,790 stalled-cycles-backend:u # 2.94% backend cycles idle (75.06%) - 35,783,766,947 instructions:u # 2.72 insn per cycle - # 0.01 stalled cycles per insn (75.06%) - 3.851933365 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.235724 sec + 13,018,811,907 cycles # 3.070 GHz + 34,384,492,801 instructions # 2.64 insn per cycle + 4.241723051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.431790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.652319e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.652319e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.545488 sec - 8,688,076,238 cycles:u # 3.371 GHz (74.90%) - 9,841,654 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) - 2,450,051,747 stalled-cycles-backend:u # 28.20% backend cycles idle (74.86%) - 21,908,081,719 instructions:u # 2.52 insn per cycle - # 0.11 stalled cycles per insn (74.97%) - 2.581714908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.527791 sec + 10,618,068,276 cycles # 3.005 GHz + 24,006,297,751 instructions # 2.26 insn per cycle + 3.533644608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.817650e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.279825e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.279825e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.718734 sec - 5,777,821,529 cycles:u # 3.300 GHz (74.90%) - 8,837,696 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.91%) - 1,463,068,323 stalled-cycles-backend:u # 25.32% backend cycles idle (74.89%) - 12,086,003,846 instructions:u # 2.09 insn per cycle - # 0.12 stalled cycles per insn (74.89%) - 1.755239088 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.268558 sec + 6,594,099,256 cycles # 2.900 GHz + 12,400,446,525 instructions # 1.88 insn per cycle + 2.274329127 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.142175 sec + 6,250,159,272 cycles # 2.911 GHz + 11,574,474,977 instructions # 1.85 insn per cycle + 2.148019416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516200 +Relative difference = 3.2588037208240405e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.637824 sec + 5,343,225,675 cycles # 2.022 GHz + 9,294,792,947 instructions # 1.74 insn per cycle + 2.643638198 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index c1d50feda3..6740b658ab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:34:23 +DATE: 2024-03-01_03:04:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.845504e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.929444e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.982727e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.376878 sec - 859,889,632 cycles:u # 2.142 GHz (74.88%) - 2,101,172 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.07%) - 4,913,440 stalled-cycles-backend:u # 0.57% backend cycles idle (75.14%) - 1,408,843,301 instructions:u # 1.64 insn per cycle - # 0.00 stalled cycles per insn (74.34%) - 0.429973828 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.525125 sec + 2,266,508,632 cycles # 2.999 GHz + 3,227,683,893 instructions # 1.42 insn per cycle + 0.815560561 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.237547e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.346977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.346977e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.418474 sec - 11,732,063,016 cycles:u # 3.401 GHz (74.96%) - 9,422,894 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) - 20,017,604 stalled-cycles-backend:u # 0.17% backend cycles idle (74.97%) - 35,763,295,048 instructions:u # 3.05 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 3.453721586 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.008193 sec + 12,350,315,150 cycles # 3.077 GHz + 35,037,181,267 instructions # 2.84 insn per cycle + 4.014100641 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.827548e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089939e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.089939e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.352011 sec - 8,010,475,735 cycles:u # 3.360 GHz (74.85%) - 9,398,588 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.02%) - 1,674,206,966 stalled-cycles-backend:u # 20.90% backend cycles idle (75.18%) - 21,206,260,334 instructions:u # 2.65 insn per cycle - # 0.08 stalled cycles per insn (75.18%) - 2.388527408 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.458899 sec + 10,688,048,117 cycles # 3.085 GHz + 23,082,662,787 instructions # 2.16 insn per cycle + 3.464737128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.596714e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.178513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.178513e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.560107 sec - 5,213,460,094 cycles:u # 3.273 GHz (74.92%) - 9,518,486 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) - 731,122,291 stalled-cycles-backend:u # 14.02% backend cycles idle (74.90%) - 11,424,220,646 instructions:u # 2.19 insn per cycle - # 0.06 stalled cycles per insn (74.92%) - 1.601201132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.175532 sec + 6,167,789,524 cycles # 2.829 GHz + 11,956,365,830 instructions # 1.94 insn per cycle + 2.181490352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.062589 sec + 6,012,687,929 cycles # 2.908 GHz + 11,129,506,913 instructions # 1.85 insn per cycle + 2.068524285 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.581777 sec + 5,215,223,845 cycles # 2.016 GHz + 9,019,923,506 instructions # 1.73 insn per cycle + 2.587755549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index ec9c2640e1..3164378b7a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:05:09 +DATE: 2024-03-01_02:28:04 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.576605e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.946758e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.111159e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.324622 sec - 772,289,751 cycles:u # 2.244 GHz (73.84%) - 2,245,044 stalled-cycles-frontend:u # 0.29% frontend cycles idle (74.82%) - 4,522,934 stalled-cycles-backend:u # 0.59% backend cycles idle (75.00%) - 1,308,126,073 instructions:u # 1.69 insn per cycle - # 0.00 stalled cycles per insn (75.21%) - 0.373595997 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.485254 sec + 2,068,141,298 cycles # 2.904 GHz + 2,916,142,359 instructions # 1.41 insn per cycle + 0.784434250 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.996944e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.087796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087796e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.642357 sec - 12,627,649,362 cycles:u # 3.440 GHz (74.94%) - 7,521,215 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.85%) - 954,432,603 stalled-cycles-backend:u # 7.56% backend cycles idle (74.85%) - 37,135,203,641 instructions:u # 2.94 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 3.672981701 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.621612 sec + 14,026,409,554 cycles # 3.032 GHz + 38,341,238,705 instructions # 2.73 insn per cycle + 4.632085783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.308088e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.778711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.778711e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.805701 sec - 6,218,278,603 cycles:u # 3.393 GHz (74.62%) - 4,713,384 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.93%) - 1,929,137,521 stalled-cycles-backend:u # 31.02% backend cycles idle (75.13%) - 15,189,730,910 instructions:u # 2.44 insn per cycle - # 0.13 stalled cycles per insn (75.13%) - 1.837265834 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.094155 sec + 6,477,656,873 cycles # 3.085 GHz + 15,815,714,256 instructions # 2.44 insn per cycle + 2.109661469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.222825e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379981e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379981e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.000951 sec - 3,394,012,608 cycles:u # 3.300 GHz (74.63%) - 8,001,044 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.88%) - 1,047,207,876 stalled-cycles-backend:u # 30.85% backend cycles idle (75.11%) - 7,668,224,582 instructions:u # 2.26 insn per cycle - # 0.14 stalled cycles per insn (75.11%) - 1.032065308 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.180439 sec + 3,464,791,228 cycles # 2.924 GHz + 7,594,553,534 instructions # 2.19 insn per cycle + 1.196926932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.103361 sec + 3,253,544,502 cycles # 2.935 GHz + 7,202,500,133 instructions # 2.21 insn per cycle + 1.115792553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.467307 sec + 3,062,229,633 cycles # 2.079 GHz + 5,834,823,887 instructions # 1.91 insn per cycle + 1.480044473 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 2b9a9d6e5c..b32abcb3fe 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,175 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:53:05 +DATE: 2024-03-01_03:14:35 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.457170e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.070824e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.070824e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.158557 sec - 3,553,635,129 cycles:u # 2.996 GHz (74.66%) - 21,028,695 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.10%) - 1,107,067,719 stalled-cycles-backend:u # 31.15% backend cycles idle (75.11%) - 3,881,698,330 instructions:u # 1.09 insn per cycle - # 0.29 stalled cycles per insn (75.01%) - 1.213936904 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.665285 sec + 2,679,931,908 cycles # 3.001 GHz + 4,173,181,221 instructions # 1.56 insn per cycle + 0.950193790 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.995627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.086783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.086783e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.683431 sec - 12,659,259,788 cycles:u # 3.407 GHz (75.03%) - 7,459,048 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.03%) - 1,249,468,881 stalled-cycles-backend:u # 9.87% backend cycles idle (75.03%) - 37,093,663,641 instructions:u # 2.93 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 3.717841479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.608146 sec + 14,198,803,048 cycles # 3.078 GHz + 38,383,841,480 instructions # 2.70 insn per cycle + 4.614561058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.311512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.745050e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.745050e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.850834 sec - 6,221,194,518 cycles:u # 3.304 GHz (74.94%) - 7,340,758 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) - 1,907,140,469 stalled-cycles-backend:u # 30.66% backend cycles idle (74.98%) - 15,554,035,131 instructions:u # 2.50 insn per cycle - # 0.12 stalled cycles per insn (74.95%) - 1.887022954 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.164951 sec + 6,682,648,138 cycles # 3.079 GHz + 16,095,511,662 instructions # 2.41 insn per cycle + 2.171478460 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.209800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.363451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.363451e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.055288 sec - 3,424,597,828 cycles:u # 3.148 GHz (75.00%) - 7,657,748 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.00%) - 1,064,027,342 stalled-cycles-backend:u # 31.07% backend cycles idle (75.06%) - 7,991,108,228 instructions:u # 2.33 insn per cycle - # 0.13 stalled cycles per insn (75.05%) - 1.091443231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.245724 sec + 3,655,872,382 cycles # 2.921 GHz + 7,830,960,228 instructions # 2.14 insn per cycle + 1.252058919 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.193275 sec + 3,439,455,837 cycles # 2.869 GHz + 7,440,735,686 instructions # 2.16 insn per cycle + 1.199824293 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.539244 sec + 3,276,504,779 cycles # 2.121 GHz + 6,089,433,455 instructions # 1.86 insn per cycle + 1.545785864 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index a3128f7500..1418229a2f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,165 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_15:04:36 +DATE: 2024-03-01_03:27:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.851869e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.945347e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.107629e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.014251 sec - 3,108,031,718 cycles:u # 2.996 GHz (74.99%) - 10,712,388 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.43%) - 1,156,197,374 stalled-cycles-backend:u # 37.20% backend cycles idle (75.34%) - 2,767,536,694 instructions:u # 0.89 insn per cycle - # 0.42 stalled cycles per insn (74.96%) - 1.060791796 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
2.962164e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.558880 sec + 2,364,095,478 cycles # 3.003 GHz + 3,484,344,192 instructions # 1.47 insn per cycle + 0.845198156 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.995904e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.086251e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.086251e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.644384 sec - 12,634,503,228 cycles:u # 3.441 GHz (74.95%) - 7,148,826 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.94%) - 1,182,578,195 stalled-cycles-backend:u # 9.36% backend cycles idle (74.85%) - 37,144,398,934 instructions:u # 2.94 insn per cycle - # 0.03 stalled cycles per insn (74.86%) - 3.673697809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.585598 sec + 14,172,267,813 cycles # 3.088 GHz + 38,370,669,897 instructions # 2.71 insn per cycle + 4.590984697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.142222e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.546841e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.546841e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.850729 sec - 6,338,703,028 cycles:u # 3.376 GHz (74.86%) - 7,177,415 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.68%) - 2,026,441,941 stalled-cycles-backend:u # 31.97% backend cycles idle (74.73%) - 15,242,487,970 instructions:u # 2.40 insn per cycle - # 0.13 stalled cycles per insn (75.13%) - 1.879541975 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 2.148796 sec + 6,634,619,629 cycles # 3.081 GHz + 15,827,825,218 instructions # 2.39 insn per cycle + 2.154083020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.223659e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.380802e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.380802e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.999997 sec - 3,369,007,067 cycles:u # 3.281 GHz (75.08%) - 7,192,520 stalled-cycles-frontend:u # 0.21% frontend cycles idle (75.08%) - 1,049,602,344 stalled-cycles-backend:u # 31.15% backend cycles idle (75.08%) - 7,677,145,813 instructions:u # 2.28 insn per cycle - # 0.14 stalled cycles per insn (74.70%) - 1.028886621 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.236002 sec + 3,624,228,310 cycles # 2.921 GHz + 7,577,923,207 instructions # 2.09 insn per cycle + 1.241371528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.166800 sec + 3,412,475,771 cycles # 2.913 GHz + 7,154,107,852 instructions # 2.10 insn per cycle + 1.172143118 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.519807 sec + 3,228,336,001 cycles # 2.118 GHz + 5,784,936,071 instructions # 1.79 insn per cycle + 1.525231071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 149b294a79..d1c301e36a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,169 +1,212 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_15:00:51 +DATE: 2024-03-01_03:21:13 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.243357e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.926144e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.087434e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.161597 sec - 3,601,612,751 cycles:u # 3.026 GHz (75.13%) - 21,057,800 stalled-cycles-frontend:u # 0.58% frontend cycles idle (75.19%) - 1,172,309,632 stalled-cycles-backend:u # 32.55% backend cycles idle (75.19%) - 3,776,445,937 instructions:u # 1.05 insn per cycle - # 0.31 stalled cycles per insn (75.22%) - 1.209122949 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.625420 sec + 2,414,961,393 cycles # 2.854 GHz + 3,791,061,685 instructions # 1.57 insn per cycle + 0.904442863 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.998981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.089526e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.089526e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.639809 sec - 12,626,040,265 cycles:u # 3.443 GHz (74.92%) - 7,470,036 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) - 803,774,158 stalled-cycles-backend:u # 6.37% backend cycles idle (74.93%) - 37,156,129,537 instructions:u # 2.94 insn per cycle - # 0.02 stalled cycles per insn (74.94%) - 3.669673012 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.586154 sec + 14,183,213,679 cycles # 3.090 GHz + 38,341,040,102 instructions # 2.70 insn per cycle + 4.591510537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199022179469 +Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.135844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.545353e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.545353e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.852475 sec - 6,354,804,966 cycles:u # 3.381 GHz (74.89%) - 7,176,488 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) - 2,019,535,309 stalled-cycles-backend:u # 31.78% backend cycles idle (74.90%) - 15,230,071,652 instructions:u # 2.40 insn per cycle - # 0.13 stalled cycles per insn (74.92%) - 1.881704419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.084805 sec + 6,467,654,599 cycles # 3.095 GHz + 15,814,952,627 instructions # 2.45 insn per cycle + 2.090234852 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.222623e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379871e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379871e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.001076 sec - 3,380,607,908 cycles:u # 3.289 GHz (74.76%) - 7,784,429 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.10%) - 1,049,808,694 stalled-cycles-backend:u # 31.05% backend cycles idle (75.10%) - 7,654,634,643 instructions:u # 2.26 insn per cycle - # 0.14 stalled cycles per insn (75.11%) - 1.029875459 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.181028 sec + 3,453,301,700 cycles # 2.913 GHz + 7,593,575,205 instructions # 2.20 insn per cycle + 1.186225517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.108864 sec + 3,247,038,827 cycles # 2.916 GHz + 7,202,168,264 instructions # 2.22 insn per cycle + 1.114391762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181684445590 +Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.464294 sec + 3,059,603,183 cycles # 2.083 GHz + 5,833,854,527 instructions # 1.91 insn per cycle + 1.469681735 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183349184692 +Relative difference = 1.6508058850146622e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 913244345c..adc2ed2114 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:05:30 +DATE: 2024-03-01_02:28:27 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.481980e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.117565e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.300799e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.323751 sec - 784,351,516 cycles:u # 2.273 GHz (74.32%) - 2,192,770 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.45%) - 4,264,005 stalled-cycles-backend:u # 0.54% backend cycles idle (75.47%) - 1,367,200,629 instructions:u # 1.74 insn per cycle - # 0.00 stalled cycles per insn (74.27%) - 0.376841355 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.480923 sec + 2,116,431,851 cycles # 3.003 GHz + 3,022,655,895 instructions # 1.43 insn per cycle + 0.777218279 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.982273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.072748e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.072748e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.660343 sec - 12,707,203,346 cycles:u # 3.446 GHz (74.89%) - 7,110,827 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) - 11,876,317 stalled-cycles-backend:u # 0.09% backend cycles idle (75.05%) - 37,479,190,658 instructions:u # 2.95 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 3.690309537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.644587 sec + 14,360,257,758 cycles # 3.089 GHz + 39,833,716,550 instructions # 2.77 insn per cycle + 4.652300252 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288199028000236 +Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.302120e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.884122e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.884122e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.579376 sec - 5,415,260,274 cycles:u # 3.370 GHz (74.78%) - 7,723,910 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.03%) - 1,444,796,820 stalled-cycles-backend:u # 26.68% backend cycles idle (75.11%) - 15,194,831,822 instructions:u # 2.81 insn per cycle - # 0.10 stalled cycles per insn (75.12%) - 1.610679733 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.888755 sec + 5,601,188,109 cycles # 2.957 GHz + 15,285,931,975 instructions # 2.73 insn per cycle + 1.901754882 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193548331037 +Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.906569e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.710449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.710449e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.320165 sec - 4,487,451,290 cycles:u # 3.332 GHz (75.15%) - 7,209,142 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.10%) - 1,668,263,971 stalled-cycles-backend:u # 37.18% backend cycles idle (75.06%) - 9,805,752,185 instructions:u # 2.19 insn per cycle - # 0.17 stalled cycles per insn (74.77%) - 1.351069736 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.623137 sec + 4,755,173,593 cycles # 2.919 GHz + 9,735,141,159 instructions # 2.05 insn per cycle + 1.639641207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186428369954 -Relative difference = 1.7604478492421832e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.586631 sec + 4,632,931,570 cycles # 2.912 GHz + 9,326,747,974 instructions # 2.01 insn per cycle + 1.599475417 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182108197361 +Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.762945 sec + 3,668,593,409 cycles # 2.074 GHz + 7,034,535,336 instructions # 1.92 insn per cycle + 1.779301540 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183459779248 +Relative difference = 1.7053177021099307e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index b6180da33a..82aee2242c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:34:44 +DATE: 2024-03-01_03:04:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.326362e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.955338e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.118530e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.325621 sec - 779,266,234 cycles:u # 2.250 GHz (74.75%) - 2,296,626 stalled-cycles-frontend:u # 0.29% frontend cycles idle (73.21%) - 4,232,127 stalled-cycles-backend:u # 0.54% backend cycles idle (73.21%) - 1,288,380,090 instructions:u # 1.65 insn per cycle - # 0.00 stalled cycles per insn (75.34%) - 0.377748048 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478757 sec + 2,104,839,063 cycles # 2.996 GHz + 2,995,662,279 instructions # 1.42 insn per cycle + 0.760483148 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.213545e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.317700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.317700e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.406405 sec - 11,820,211,699 cycles:u # 3.442 GHz (74.86%) - 6,860,688 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) - 1,730,547,263 stalled-cycles-backend:u # 14.64% backend cycles idle (75.04%) - 34,233,034,389 instructions:u # 2.90 insn per cycle - # 0.05 stalled cycles per insn (75.08%) - 3.436615248 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.311067 sec + 12,598,770,011 cycles # 2.919 GHz + 34,372,549,657 instructions # 2.73 insn per cycle + 4.316594695 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.214009e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.786074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.786074e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.597296 sec - 5,453,738,916 cycles:u # 3.356 GHz (74.89%) - 7,219,487 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.90%) - 2,066,216,281 stalled-cycles-backend:u # 37.89% backend cycles idle (74.69%) - 14,675,976,525 instructions:u # 2.69 insn per cycle - # 0.14 stalled cycles per insn (74.74%) - 1.628491271 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.978899 sec + 6,105,197,866 cycles # 3.078 GHz + 14,859,942,037 instructions # 2.43 insn per cycle + 1.984598314 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198769558221 -Relative difference = 6.06481491495597e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193803280592 +Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.357766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.025050e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.025050e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.264737 sec - 4,300,837,383 cycles:u # 3.327 GHz (74.68%) - 7,671,605 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.68%) - 1,661,680,380 stalled-cycles-backend:u # 38.64% backend cycles idle (74.98%) - 9,050,907,731 instructions:u # 2.10 insn per cycle - # 0.18 stalled cycles per insn (75.25%) - 1.296250495 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.494763 sec + 4,316,279,907 cycles # 2.878 GHz + 9,028,948,283 instructions # 2.09 insn per cycle + 1.500523975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186752004549 -Relative difference = 1.6009291367898262e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.509333 sec + 4,207,142,397 cycles # 2.778 GHz + 8,663,183,236 instructions # 2.06 insn per cycle + 1.515104262 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288181999931112 +Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.886655 sec + 3,832,564,290 cycles # 2.026 GHz + 7,807,000,610 instructions # 2.04 insn per cycle + 1.892395760 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183246739209 +Relative difference = 1.6003107281264138e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index a98059d056..dda1db1b3c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:35:04 +DATE: 2024-03-01_03:05:16 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.680623e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.114696e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.297874e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.078079e+00 +- 3.394933e-03 ) GeV^0 -TOTAL : 0.322860 sec - 733,848,024 cycles:u # 2.130 GHz (75.93%) - 1,991,309 stalled-cycles-frontend:u # 0.27% frontend cycles idle (76.65%) - 4,436,940 stalled-cycles-backend:u # 0.60% backend cycles idle (74.74%) - 1,320,496,964 instructions:u # 1.80 insn per cycle - # 0.00 stalled cycles per insn (74.03%) - 0.374386065 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478497 sec + 2,092,584,267 cycles # 2.987 GHz + 2,982,481,806 instructions # 1.43 insn per cycle + 0.759974164 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.490256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613448e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613448e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.147954 sec - 10,917,586,256 cycles:u # 3.438 GHz (74.84%) - 6,884,856 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.96%) - 125,728,239 stalled-cycles-backend:u # 1.15% backend cycles idle (75.06%) - 35,430,594,123 instructions:u # 3.25 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 3.178248752 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 3.962914 sec + 11,745,545,496 cycles # 2.960 GHz + 35,108,793,810 instructions # 2.99 insn per cycle + 3.968579892 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199088536203 -Relative difference = 4.4925808981097166e-08 +Avg ME (F77/C++) = 2.0288199094356969 +Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.802344e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.483665e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.483665e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.486921 sec - 5,089,368,535 cycles:u # 3.360 GHz (74.69%) - 7,103,657 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.84%) - 1,342,600,549 stalled-cycles-backend:u # 26.38% backend cycles idle (75.11%) - 14,067,345,434 instructions:u # 2.76 insn per cycle - # 0.10 stalled cycles per insn (75.18%) - 1.518672718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.925244 sec + 5,962,598,726 cycles # 3.089 GHz + 14,469,931,867 instructions # 2.43 insn per cycle + 1.931094914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198892958462 -Relative difference = 5.4565783974899003e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288193583255634 +Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.023520e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.131303e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.131303e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.168445 sec - 3,961,187,521 cycles:u # 3.311 GHz (74.64%) - 7,048,970 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.73%) - 1,467,403,847 stalled-cycles-backend:u # 37.04% backend cycles idle (75.06%) - 8,611,214,037 instructions:u # 2.17 insn per cycle - # 0.17 stalled cycles per insn (75.27%) - 1.200246018 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.475701 sec + 4,155,772,808 cycles # 2.809 GHz + 8,874,967,057 instructions # 2.14 insn per cycle + 1.481449825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186836987734 -Relative difference = 1.559041129563128e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.405788 sec + 4,123,527,517 cycles # 2.923 GHz + 8,411,119,259 instructions # 2.04 insn per cycle + 1.411551419 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288182107033208 +Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.851731 sec + 3,787,634,254 cycles # 2.040 GHz + 7,699,934,932 instructions # 2.03 insn per cycle + 1.857323010 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183204829693 +Relative difference = 1.5796536184903122e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 7906350f50..9748a5aab4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:05:50 +DATE: 2024-03-01_02:28:51 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.858100e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.024750e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.080805e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.376752 sec - 893,024,523 cycles:u # 2.224 GHz (74.23%) - 2,244,421 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%) - 5,188,348 stalled-cycles-backend:u # 0.58% backend cycles idle (76.10%) - 1,372,887,375 instructions:u # 1.54 insn per cycle - # 0.00 stalled cycles per insn (75.13%) - 0.429724861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.526886 sec + 2,307,341,508 cycles # 3.024 GHz + 3,271,429,537 instructions # 1.42 insn per cycle + 0.836809323 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.465279e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.528277e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.528277e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.433247 sec - 15,327,724,629 cycles:u # 3.433 GHz (74.93%) - 9,891,067 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) - 1,296,374,959 stalled-cycles-backend:u # 8.46% backend cycles idle (74.92%) - 39,274,027,041 instructions:u # 2.56 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 4.467582066 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.926720 sec + 15,303,062,403 cycles # 3.103 GHz + 38,574,821,235 instructions # 2.52 insn per cycle + 4.935986004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.423932e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.642478e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.642478e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.549029 sec - 8,692,877,164 cycles:u # 3.369 GHz (74.91%) - 9,320,788 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.92%) - 713,069,369 stalled-cycles-backend:u # 8.20% backend cycles idle (74.90%) - 24,130,962,669 instructions:u # 2.78 insn per cycle - # 0.03 stalled cycles per insn (74.91%) - 2.584633340 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.903163 sec + 8,984,859,488 cycles # 3.089 GHz + 24,224,163,348 instructions # 2.70 insn per cycle + 2.918366508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.864236e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.484768e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.484768e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.513255 sec - 5,092,010,127 cycles:u # 3.296 GHz (74.74%) - 8,390,120 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.67%) - 559,300,803 stalled-cycles-backend:u # 10.98% backend cycles idle (74.93%) - 11,400,119,673 instructions:u # 2.24 insn per cycle - # 0.05 stalled cycles per insn (75.15%) - 1.549217009 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.860423 sec + 5,396,289,064 cycles # 2.891 GHz + 11,276,510,611 instructions # 2.09 insn per cycle + 1.875091896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.648151 sec + 4,836,682,110 cycles # 2.924 GHz + 10,524,586,299 instructions # 2.18 insn per cycle + 1.662467551 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.587933 sec + 5,228,382,592 cycles # 2.016 GHz + 7,603,380,674 instructions # 1.45 insn per cycle + 2.604403134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index ed6d72052a..4c3bdeb3a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,168 +1,210 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-03_14:06:12 +DATE: 2024-03-01_02:29:18 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.850594e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923234e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.976389e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.087161e+00 +- 3.410053e-03 ) GeV^0 -TOTAL : 0.376788 sec - 888,926,488 cycles:u # 2.214 GHz (74.23%) - 2,149,952 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.13%) - 5,354,153 stalled-cycles-backend:u # 0.60% backend cycles idle (74.13%) - 1,398,211,851 instructions:u # 1.57 insn per cycle - # 0.00 stalled cycles per insn (74.89%) - 0.430137061 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.529654 sec + 2,293,467,091 cycles # 2.992 GHz + 3,241,408,242 instructions # 1.41 insn per cycle + 0.836485234 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.442637e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.503281e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.503281e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.472354 sec - 15,433,122,317 cycles:u # 3.426 GHz (74.96%) - 9,252,197 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 132,124,539 stalled-cycles-backend:u # 0.86% backend cycles idle (74.98%) - 40,169,887,469 instructions:u # 2.60 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 4.507625294 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.994421 sec + 15,338,753,655 cycles # 3.068 GHz + 40,369,233,372 instructions # 2.63 insn per cycle + 5.002383718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.490737e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.715926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.715926e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.513977 sec - 8,592,144,959 cycles:u # 3.375 GHz (74.88%) - 10,115,550 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.88%) - 846,577,328 stalled-cycles-backend:u # 9.85% backend cycles idle (74.86%) - 23,518,270,139 instructions:u # 2.74 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 2.549704076 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.723159 sec + 8,478,435,163 cycles # 3.107 GHz + 23,253,497,249 instructions # 2.74 insn per cycle + 2.738604338 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.818297e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.281601e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.281601e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.717535 sec - 5,792,506,610 cycles:u # 3.311 GHz (74.66%) - 10,237,498 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.66%) - 739,803,153 stalled-cycles-backend:u # 12.77% backend cycles idle (74.87%) - 13,069,387,543 instructions:u # 2.26 insn per cycle - # 0.06 stalled cycles per insn (75.10%) - 1.753459731 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.127824 sec + 6,241,547,842 cycles # 2.925 GHz + 12,962,413,577 instructions # 2.08 insn per cycle + 2.144515260 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.074458 sec + 5,923,278,346 cycles # 2.853 GHz + 12,242,730,346 instructions # 2.07 insn per cycle + 2.086429072 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.794263 sec + 5,618,790,292 cycles # 2.007 GHz + 8,743,459,975 instructions # 1.56 insn per cycle + 2.808786612 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288064057068964 +Relative difference = 2.9292737240031234e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 5dc4677d13..c4c4bff630 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:06:34 +DATE: 2024-03-01_02:29:46 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.902087e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.056700e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.064313e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.413827 sec - 1,039,031,182 cycles:u # 2.496 GHz (73.86%) - 2,272,214 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.06%) - 5,009,677 stalled-cycles-backend:u # 0.48% backend cycles idle (75.09%) - 1,499,486,038 instructions:u # 1.44 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 0.461232645 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463329 sec + 2,069,832,304 cycles # 3.002 GHz + 2,918,096,235 instructions # 1.41 insn per cycle + 0.772559551 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.637828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.844690e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850403e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.621274 sec - 1,699,917,764 cycles:u # 2.621 GHz (75.45%) - 2,260,625 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.35%) - 4,823,180 stalled-cycles-backend:u # 0.28% backend cycles idle (75.59%) - 2,014,674,020 instructions:u # 1.19 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 0.674327003 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608947 sec + 2,562,374,732 cycles # 3.012 GHz + 3,879,371,783 instructions # 1.51 insn per cycle + 0.910123971 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.946395e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.958721e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958721e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.584711 sec - 19,628,415,309 cycles:u # 3.501 GHz (74.97%) - 2,431,313 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 3,361,707,387 stalled-cycles-backend:u # 17.13% backend cycles idle (75.04%) - 57,925,297,511 instructions:u # 2.95 insn per cycle - # 0.06 stalled cycles per insn (75.04%) - 5.609734489 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.359535 sec + 19,687,428,773 cycles # 3.094 GHz + 59,604,296,849 instructions # 3.03 insn per cycle + 6.365859123 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.104957e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.156893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.156893e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.707312 sec - 9,533,927,731 cycles:u # 3.492 GHz (74.88%) - 2,474,777 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.82%) - 2,355,567,040 stalled-cycles-backend:u # 24.71% backend cycles idle (74.96%) - 29,949,411,318 instructions:u # 3.14 insn per cycle - # 0.08 stalled cycles per insn (75.09%) - 2.736384477 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.515479 sec + 10,373,655,779 cycles # 2.948 GHz + 30,676,465,519 instructions # 2.96 insn per cycle + 3.528584808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.260863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283029e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.322910 sec - 4,691,971,408 cycles:u # 3.487 GHz (74.89%) - 2,333,999 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.03%) - 1,527,580,422 stalled-cycles-backend:u # 32.56% backend cycles idle (75.03%) - 11,213,604,407 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.03%) - 1.349256507 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.702212 sec + 4,885,421,396 cycles # 2.863 GHz + 11,020,224,832 instructions # 2.26 insn per cycle + 1.717667988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.517268 sec + 4,368,757,303 cycles # 2.872 GHz + 10,296,904,442 instructions # 2.36 insn per cycle + 1.532957385 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.135983 sec + 4,101,318,849 cycles # 1.917 GHz + 5,843,401,136 instructions # 1.42 insn per cycle + 2.151041040 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 4aab83e9a1..7a80a6327c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:53:26 +DATE: 2024-03-01_03:14:59 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.490310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.018073e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.018073e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.564985 sec - 1,665,296,767 cycles:u # 2.836 GHz (74.91%) - 10,273,236 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.51%) - 268,134,122 stalled-cycles-backend:u # 16.10% backend cycles idle (75.61%) - 1,999,752,314 instructions:u # 1.20 insn per cycle - # 0.13 stalled cycles per insn (75.59%) - 0.610025344 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.494713 sec + 2,059,588,733 cycles # 2.926 GHz + 3,067,379,574 instructions # 1.49 insn per cycle + 0.764554853 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.205511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674234e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674234e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.267387 sec - 3,811,409,644 cycles:u # 2.921 GHz (75.15%) - 29,986,646 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.15%) - 859,820,788 stalled-cycles-backend:u # 22.56% backend cycles idle (74.88%) - 3,799,477,591 instructions:u # 1.00 insn per cycle - # 0.23 stalled cycles per insn (74.89%) - 1.329741015 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.824199 sec + 3,179,114,916 cycles # 2.965 GHz + 5,069,610,946 instructions # 1.59 insn per cycle + 1.133521853 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.944931e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.957297e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.957297e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.591656 sec - 19,640,576,801 cycles:u # 3.498 GHz (74.93%) - 2,543,236 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 3,295,328,624 stalled-cycles-backend:u # 16.78% backend cycles idle (74.95%) - 57,913,028,019 instructions:u # 2.95 insn per cycle - # 0.06 stalled cycles per insn (75.07%) - 5.617003149 seconds time elapsed -=Symbols in CPPProcess.o= 
(~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.518056 sec + 19,750,480,394 cycles # 3.028 GHz + 59,611,727,500 instructions # 3.02 insn per cycle + 6.522447301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.103744e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.156017e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.156017e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.712375 sec - 9,544,285,539 cycles:u # 3.489 GHz (74.75%) - 2,602,426 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) - 2,338,962,459 stalled-cycles-backend:u # 24.51% backend cycles idle (75.15%) - 29,996,079,828 instructions:u # 3.14 insn per cycle - # 0.08 stalled cycles per insn (75.15%) - 2.739150856 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.370584 sec + 10,396,817,898 cycles # 3.081 GHz + 30,723,473,589 instructions # 2.96 insn per cycle + 3.375008450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.255164e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.277054e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277054e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.333524 sec - 4,705,116,374 cycles:u # 3.469 GHz (74.77%) - 2,933,791 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.46%) - 1,535,647,412 stalled-cycles-backend:u # 32.64% backend cycles idle (74.76%) - 11,250,242,509 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.23%) - 1.360215835 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.685691 sec + 4,902,930,827 cycles # 2.902 GHz + 11,066,989,869 instructions # 2.26 insn per cycle + 1.690115997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.513774 sec + 4,402,683,305 cycles # 2.901 GHz + 10,346,890,880 instructions # 2.35 insn per cycle + 1.518250177 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.132010 sec + 4,131,468,761 cycles # 1.935 GHz + 5,881,941,509 instructions # 1.42 insn per cycle + 2.136586909 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 66aaaaaf83..90bf6e6455 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:06:59 +DATE: 2024-03-01_02:30:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.876910e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.024569e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.032530e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.397674 sec - 1,090,557,817 cycles:u # 2.613 GHz (73.44%) - 2,354,638 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.22%) - 4,877,535 stalled-cycles-backend:u # 0.45% backend cycles idle (74.90%) - 1,522,275,397 instructions:u # 1.40 insn per cycle - # 0.00 stalled cycles per insn (75.29%) - 0.445131973 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.465265 sec + 2,029,896,808 cycles # 2.980 GHz + 2,854,741,238 instructions # 1.41 insn per cycle + 0.763772288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.593750e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.812455e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818105e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.620125 sec - 1,698,536,250 cycles:u # 2.634 GHz (75.30%) - 2,170,927 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.20%) - 5,201,162 stalled-cycles-backend:u # 0.31% backend cycles idle (75.54%) - 1,984,023,513 instructions:u # 1.17 insn per cycle - # 0.00 stalled cycles per insn (75.44%) - 0.673847841 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607194 sec + 2,545,937,909 cycles # 2.996 GHz + 3,826,405,631 instructions # 1.50 insn per cycle + 0.909330494 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.917218e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.929307e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.929307e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.639988 sec - 19,807,518,823 cycles:u # 3.498 GHz (74.99%) - 2,620,483 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 4,399,963,860 stalled-cycles-backend:u # 22.21% backend cycles idle (74.99%) - 57,755,607,483 instructions:u # 2.92 insn per cycle - # 0.08 stalled cycles per insn (74.99%) - 5.664770908 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.317260 sec + 19,445,883,412 cycles # 3.076 GHz + 58,795,735,881 instructions # 3.02 insn per cycle + 6.323702590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432431 -Relative difference = 4.4692302355460254e-07 +Avg ME (F77/C++) = 1.4131213684432433 +Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.014022e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.064686e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.064686e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.747663 sec - 9,670,552,206 cycles:u # 3.490 GHz (74.88%) - 2,329,087 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) - 2,099,453,756 stalled-cycles-backend:u # 21.71% backend cycles idle (74.90%) - 30,377,591,174 instructions:u # 3.14 insn per cycle - # 0.07 stalled cycles per insn (75.04%) - 2.774330523 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.363533 sec + 10,256,448,579 cycles # 3.046 GHz + 30,347,165,405 instructions # 2.96 insn per cycle + 3.377280590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.199979e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.219997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219997e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.388695 sec - 4,910,323,743 cycles:u # 3.479 GHz (74.97%) - 2,629,932 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.07%) - 1,693,812,265 stalled-cycles-backend:u # 34.49% backend cycles idle (75.07%) - 11,669,214,722 instructions:u # 2.38 insn per cycle - # 0.15 stalled cycles per insn (75.07%) - 1.415480675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728674 sec + 5,043,692,461 cycles # 2.911 GHz + 11,484,727,811 instructions # 2.28 insn per cycle + 1.738921569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.607009 sec + 4,642,681,786 cycles # 2.882 GHz + 10,842,961,046 instructions # 2.34 insn per cycle + 1.618440779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.134046 sec + 4,109,311,958 cycles # 1.922 GHz + 6,106,472,133 instructions # 1.49 insn per cycle + 2.145705149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 53e89252d5..af4f474b65 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:07:23 +DATE: 2024-03-01_02:30:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.203921e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.929807e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.023161e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.328270 sec - 812,063,986 cycles:u # 2.328 GHz (73.46%) - 2,362,031 stalled-cycles-frontend:u # 0.29% frontend cycles idle (75.14%) - 4,822,306 stalled-cycles-backend:u # 0.59% backend cycles idle (75.14%) - 1,357,340,463 instructions:u # 1.67 insn per cycle - # 0.00 stalled cycles per insn (75.81%) - 0.372667349 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.445727 sec + 2,001,558,197 cycles # 3.000 GHz + 2,820,746,449 instructions # 1.41 insn per cycle + 0.736568143 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.361988e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.630320e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.636311e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.443221 sec - 1,173,104,015 cycles:u # 2.521 GHz (74.19%) - 2,315,115 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.91%) - 4,226,657 stalled-cycles-backend:u # 0.36% backend cycles idle (74.98%) - 1,508,758,815 instructions:u # 1.29 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 0.490579299 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.500107 sec + 2,158,124,631 cycles # 2.977 GHz + 3,092,829,809 instructions # 1.43 insn per cycle + 0.784432881 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.266922e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.282433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.282433e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.036678 sec - 17,710,352,240 cycles:u # 3.501 GHz (74.98%) - 2,410,483 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 2,136,534,101 stalled-cycles-backend:u # 12.06% backend cycles idle (75.02%) - 55,283,453,729 instructions:u # 3.12 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 5.061421346 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.146873 sec + 19,061,096,774 cycles # 3.099 GHz + 58,958,014,215 instructions # 3.09 insn per cycle + 6.153306662 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.088894e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106292e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.525878 sec - 5,391,591,095 cycles:u # 3.482 GHz (74.72%) - 2,226,906 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.83%) - 1,682,078,164 stalled-cycles-backend:u # 31.20% backend cycles idle (75.06%) - 16,171,756,161 instructions:u # 3.00 insn per cycle - # 0.10 stalled cycles per insn (75.21%) - 1.551958544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.886682 sec + 5,850,782,122 cycles # 3.096 GHz + 16,695,269,066 instructions # 2.85 insn per cycle + 1.898716135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.355662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.435482e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.435482e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.716733 sec - 2,558,521,208 cycles:u # 3.462 GHz (74.93%) - 2,064,335 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.11%) - 789,090,274 stalled-cycles-backend:u # 30.84% backend cycles idle (75.11%) - 6,104,077,165 instructions:u # 2.39 insn per cycle - # 0.13 stalled cycles per insn (75.12%) - 0.742553521 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.886334 sec + 2,581,461,055 cycles # 2.900 GHz + 5,980,838,355 instructions # 2.32 insn per cycle + 0.901108038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.825324 sec + 2,349,134,788 cycles # 2.832 GHz + 5,603,128,082 instructions # 2.39 insn per cycle + 0.837493797 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.138775 sec + 2,054,810,359 cycles # 1.798 GHz + 3,334,038,485 instructions # 1.62 insn per cycle + 1.149410848 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 56198cb285..f62f4c8cdf 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:53:51 +DATE: 2024-03-01_03:15:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.278258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.631600e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.631600e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 -TOTAL : 0.498007 sec - 1,466,960,248 cycles:u # 2.811 GHz (74.07%) - 11,056,858 stalled-cycles-frontend:u # 0.75% frontend cycles idle (74.97%) - 48,152,138 stalled-cycles-backend:u # 3.28% backend cycles idle (74.45%) - 1,977,966,925 instructions:u # 1.35 insn per cycle - # 0.02 stalled cycles per insn (74.03%) - 0.544490496 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.451281 sec + 1,977,131,537 cycles # 2.986 GHz + 2,910,150,577 instructions # 1.47 insn per cycle + 0.718929629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.110902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.469331e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.469331e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 -TOTAL : 1.101783 sec - 3,362,274,754 cycles:u # 2.971 GHz (74.60%) - 29,414,665 stalled-cycles-frontend:u # 0.87% frontend cycles idle (74.88%) - 932,389,513 stalled-cycles-backend:u # 27.73% backend cycles idle (75.31%) - 3,400,944,853 instructions:u # 1.01 insn per cycle - # 0.27 stalled cycles per insn (75.29%) - 1.154441526 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.637857 sec + 2,608,085,808 cycles # 2.999 GHz + 3,961,129,191 instructions # 1.52 insn per cycle + 0.928114705 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe 
--common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.270228e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.285880e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.285880e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.033945 sec - 17,685,688,410 cycles:u # 3.497 GHz (75.01%) - 2,445,054 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 2,136,376,004 stalled-cycles-backend:u # 12.08% backend cycles idle (75.01%) - 55,264,318,595 instructions:u # 3.12 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 5.059502444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / 
`nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.166590 sec + 19,068,958,964 cycles # 3.091 GHz + 58,962,429,433 instructions # 3.09 insn per cycle + 6.170849448 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.080447e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097594e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.540979 sec - 5,440,621,359 cycles:u # 3.479 GHz (74.94%) - 2,245,671 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.94%) - 1,706,100,678 stalled-cycles-backend:u # 31.36% backend cycles idle (74.94%) - 16,203,790,784 instructions:u # 2.98 insn per cycle - # 0.11 stalled cycles per insn (74.94%) - 1.567488020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.898339 sec + 5,876,062,473 cycles # 3.090 GHz + 16,741,995,731 instructions # 2.85 insn per cycle + 1.902713080 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.368533e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449245e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.715861 sec - 2,551,325,043 cycles:u # 3.453 GHz (74.92%) - 2,001,393 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.10%) - 794,495,145 stalled-cycles-backend:u # 31.14% backend cycles idle (75.10%) - 6,130,305,492 instructions:u # 2.40 insn per cycle - # 0.13 stalled cycles per insn (75.11%) - 0.742317031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.895765 sec + 2,600,620,319 cycles # 2.891 GHz + 6,016,590,564 instructions # 2.31 insn per cycle + 0.900189489 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.810420 sec + 2,363,958,510 cycles # 2.904 GHz + 5,639,045,986 instructions # 2.39 insn per cycle + 0.814799834 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.048212 sec + 2,071,251,869 cycles # 1.970 GHz + 3,374,799,702 instructions # 1.63 insn per cycle + 1.052574627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 6560a5a7c4..b43a9401e8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:07:45 +DATE: 2024-03-01_02:31:09 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.196681e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.928788e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.036655e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.415474e+04 +- 1.288238e+04 ) GeV^-2 -TOTAL : 0.329154 sec - 835,208,762 cycles:u # 2.395 GHz (73.30%) - 2,288,881 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.84%) - 5,214,532 stalled-cycles-backend:u # 0.62% backend cycles idle (74.91%) - 1,324,908,753 instructions:u # 1.59 insn per cycle - # 0.00 stalled cycles per insn (75.74%) - 0.373210994 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.446885 sec + 1,972,174,797 cycles # 2.962 GHz + 2,746,314,290 instructions # 1.39 insn per cycle + 0.738224654 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.426810e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.693888e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.700335e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.619620e+05 +- 1.611328e+05 ) GeV^-2 -TOTAL : 0.443145 sec - 1,181,226,145 cycles:u # 2.537 GHz (75.19%) - 2,204,442 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.91%) - 5,622,285 stalled-cycles-backend:u # 0.48% backend cycles idle (74.91%) - 1,584,875,011 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (75.83%) - 0.490414289 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.497273 sec + 2,176,246,033 cycles # 3.004 GHz + 3,133,180,341 instructions # 1.44 insn per cycle + 0.782102946 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669531526541 -Relative difference = 0.0005401805380429868 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.251763e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.267144e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.267144e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.059826 sec - 17,789,469,969 cycles:u # 3.501 GHz (74.98%) - 2,674,859 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) - 3,105,019,101 stalled-cycles-backend:u # 17.45% backend cycles idle (74.97%) - 55,027,375,905 instructions:u # 3.09 insn per cycle - # 0.06 stalled cycles per insn (74.97%) - 5.084642260 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.143350 sec + 18,995,848,931 cycles # 3.090 GHz + 58,700,265,502 instructions # 3.09 insn per cycle + 6.150073952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129858051842916 +Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.124434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.143020e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.143020e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.478176 sec - 5,232,431,526 cycles:u # 3.487 GHz (75.03%) - 2,092,634 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) - 1,443,047,898 stalled-cycles-backend:u # 27.58% backend cycles idle (74.95%) - 16,249,178,116 instructions:u # 3.11 insn per cycle - # 0.09 stalled cycles per insn (74.95%) - 1.504828895 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.804269 sec + 5,584,642,506 cycles # 3.088 GHz + 16,510,962,038 instructions # 2.96 insn per cycle + 1.819572816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857712652836 -Relative difference = 1.618803841657786e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.412987e+00 +Avg ME (F77/C++) = 1.4129865669244737 +Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.127769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.192769e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.192769e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.791024 sec - 2,813,900,368 cycles:u # 3.459 GHz (74.45%) - 2,924,467 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.60%) - 844,743,001 stalled-cycles-backend:u # 30.02% backend cycles idle (75.08%) - 6,732,540,107 instructions:u # 2.39 insn per cycle - # 0.13 stalled cycles per insn (75.43%) - 0.817224988 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.022630 sec + 2,975,513,176 cycles # 2.898 GHz + 6,634,498,276 instructions # 2.23 insn per cycle + 1.034400565 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.945795 sec + 2,752,522,160 cycles # 2.898 GHz + 6,256,039,450 instructions # 2.27 insn per cycle + 0.961442115 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133161655815059 +Relative difference = 1.1715816267550621e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.200320 sec + 2,230,572,619 cycles # 1.852 GHz + 3,698,329,997 instructions # 1.66 insn per cycle + 1.213663484 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164033579249 +Relative difference = 2.85398258307829e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 266ca660a0..568d6c4513 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:08:06 +DATE: 2024-03-01_02:31:34 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.902026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.051350e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057968e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.394057 sec - 1,072,493,406 cycles:u # 2.584 GHz (73.70%) - 2,358,166 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.91%) - 5,190,578 stalled-cycles-backend:u # 0.48% backend cycles idle (75.17%) - 1,527,179,402 instructions:u # 1.42 insn per cycle - # 0.00 stalled cycles per insn (75.92%) - 0.441197229 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463709 sec + 2,071,639,040 cycles # 3.004 GHz + 2,941,031,538 instructions # 1.42 insn per cycle + 0.764842159 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.641496e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.852228e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.621271 sec - 1,710,114,837 cycles:u # 2.637 GHz (74.54%) - 2,243,098 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.24%) - 5,004,865 stalled-cycles-backend:u # 0.29% backend cycles idle (75.41%) - 2,017,082,158 instructions:u # 1.18 insn per cycle - # 0.00 stalled cycles per insn (75.36%) - 0.674174885 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608855 sec + 2,552,084,280 cycles # 3.004 GHz + 3,794,047,088 instructions # 1.49 insn per cycle + 0.909216297 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.889373e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.901249e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.901249e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.694274 sec - 19,995,566,723 cycles:u # 3.498 GHz (74.95%) - 2,721,102 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 4,191,817,537 stalled-cycles-backend:u # 20.96% backend cycles idle (74.96%) - 59,211,742,429 instructions:u # 2.96 insn per cycle - # 0.07 stalled cycles per insn (75.00%) - 5.719077736 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.456566 sec + 20,000,355,725 cycles # 3.096 GHz + 60,532,425,335 instructions # 3.03 insn per cycle + 6.462989015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.112243e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.164517e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.164517e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.704115 sec - 9,537,760,029 cycles:u # 3.498 GHz (74.86%) - 2,331,362 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) - 2,352,954,265 stalled-cycles-backend:u # 24.67% backend cycles idle (75.06%) - 29,754,943,887 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (75.07%) - 2.730308483 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.288178 sec + 10,191,043,016 cycles # 3.096 GHz + 30,384,591,666 instructions # 2.98 insn per cycle + 3.302408299 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.262207e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.284404e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.284404e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.321482 sec - 4,683,498,847 cycles:u # 3.484 GHz (74.95%) - 2,266,589 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 1,546,576,102 stalled-cycles-backend:u # 33.02% backend cycles idle (75.01%) - 11,204,209,244 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.01%) - 1.347795806 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.686926 sec + 4,874,678,301 cycles # 2.883 GHz + 10,979,160,826 instructions # 2.25 insn per cycle + 1.698730583 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.469271 sec + 4,278,421,569 cycles # 2.904 GHz + 10,248,685,624 instructions # 2.40 insn per cycle + 1.480280367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.183850 sec + 4,204,822,902 cycles # 1.923 GHz + 6,044,506,630 instructions # 1.44 insn per cycle + 2.192719745 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index ed7f390279..2001d2a062 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-03_14:08:31 +DATE: 2024-03-01_02:32:03 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.898925e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.041591e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.050657e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.872208e+03 +- 2.725298e+03 ) GeV^-2 -TOTAL : 0.392553 sec - 1,067,884,395 cycles:u # 2.579 GHz (73.45%) - 2,397,019 stalled-cycles-frontend:u # 0.22% frontend cycles idle (73.29%) - 5,343,057 stalled-cycles-backend:u # 0.50% backend cycles idle (74.99%) - 1,520,090,207 instructions:u # 1.42 insn per cycle - # 0.00 stalled cycles per insn (75.71%) - 0.437411352 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) 
sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.461655 sec + 2,079,301,655 cycles # 3.013 GHz + 2,945,288,445 instructions # 1.42 insn per cycle + 0.761228896 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.627682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.835274e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.840980e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.805651e+03 +- 1.746055e+03 ) GeV^-2 -TOTAL : 0.615989 sec - 1,701,218,541 cycles:u # 2.643 GHz (75.08%) - 2,280,823 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.16%) - 5,233,329 stalled-cycles-backend:u # 0.31% backend cycles idle (75.52%) - 1,993,728,288 instructions:u # 1.17 insn per cycle - # 0.00 stalled cycles per insn (75.43%) - 0.670239749 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 
) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.603998 sec + 2,550,056,991 cycles # 3.016 GHz + 3,770,712,997 instructions # 1.48 insn per cycle + 0.905342631 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.902767e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.914803e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.914803e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.668045 sec - 19,920,122,262 cycles:u # 3.500 GHz (74.98%) - 2,563,039 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 4,140,843,564 stalled-cycles-backend:u # 20.79% backend cycles idle (74.98%) - 58,705,733,304 instructions:u # 2.95 insn per cycle - # 0.07 stalled cycles per insn (74.98%) - 5.693347934 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.482109 sec + 19,897,203,281 cycles # 3.068 GHz + 59,934,079,759 instructions # 3.01 insn per cycle + 6.488470935 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.199590e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.254972e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.254972e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.666534 sec - 9,395,499,000 cycles:u # 3.494 GHz (74.98%) - 2,544,995 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) - 2,073,809,606 stalled-cycles-backend:u # 22.07% backend cycles idle (75.01%) - 30,156,482,499 instructions:u # 3.21 insn per cycle - # 0.07 stalled cycles per insn (75.02%) - 2.693096800 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.246582 sec + 10,068,513,741 cycles # 3.097 GHz + 30,097,905,174 instructions # 2.99 insn per cycle + 3.264343936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.217824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.238455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.238455e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.368644 sec - 4,832,692,381 cycles:u # 3.473 GHz (74.71%) - 2,500,289 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.75%) - 1,555,804,345 stalled-cycles-backend:u # 32.19% backend cycles idle (74.93%) - 11,688,372,170 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (75.18%) - 1.395801996 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728964 sec + 5,016,079,762 cycles # 2.895 GHz + 11,483,054,886 instructions # 2.29 insn per cycle + 1.742427809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.580395 sec + 4,590,869,899 cycles # 2.898 GHz + 10,811,034,467 instructions # 2.35 insn per cycle + 1.596114627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.184061 sec + 4,216,157,602 cycles # 1.927 GHz + 6,273,944,868 instructions # 1.49 insn per cycle + 2.195028764 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213786174055 +Relative difference = 4.3972324717191576e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index c60f4bd576..c4f627d4b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:08:55 +DATE: 2024-03-01_02:32:32 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.443213e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.592458e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.593832e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.658516 sec - 1,915,656,349 cycles:u # 2.930 GHz (74.58%) - 2,322,198 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.44%) - 5,431,561 stalled-cycles-backend:u # 0.28% backend cycles idle (74.96%) - 2,119,719,157 instructions:u # 1.11 insn per cycle - # 0.00 stalled cycles per insn (76.05%) - 0.704615889 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.491439e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.526891 sec + 2,312,216,646 cycles # 3.007 GHz + 3,538,385,257 instructions # 1.53 insn per cycle + 0.841955777 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.246133e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.246191e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.700449 sec - 26,524,218,435 cycles:u # 3.432 GHz (74.95%) - 3,249,930 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 4,782,652 stalled-cycles-backend:u # 0.02% backend cycles idle (75.06%) - 21,094,095,538 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 7.758120507 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.037875 sec + 10,086,152,870 cycles # 3.059 GHz + 22,511,661,776 instructions # 2.23 insn per cycle + 3.352868148 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.232576e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233464e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233464e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.363302 sec - 25,824,388,635 cycles:u # 3.501 GHz (74.95%) - 1,475,547 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,692,253,610 stalled-cycles-backend:u # 14.30% backend cycles idle (74.96%) - 81,773,513,781 instructions:u # 3.17 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 7.388142336 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.365178 sec + 25,629,682,297 cycles # 3.063 GHz + 78,935,463,104 instructions # 3.08 insn per cycle + 8.371779038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.049918e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.054493e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.054493e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.255362 sec - 11,447,299,246 cycles:u # 3.492 GHz (74.89%) - 1,095,573 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 1,370,623,003 stalled-cycles-backend:u # 11.97% backend cycles idle (75.07%) - 39,243,650,589 instructions:u # 3.43 insn per cycle - # 0.03 stalled cycles per insn (75.11%) - 3.281919712 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352554 sec + 12,920,825,541 cycles # 2.966 GHz + 39,280,019,197 instructions # 3.04 insn per cycle + 4.370436126 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.209079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.211700e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.211700e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.364169 sec - 4,834,989,119 cycles:u # 3.486 GHz (74.79%) - 724,221 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.20%) - 559,585,514 stalled-cycles-backend:u # 11.57% backend cycles idle (75.20%) - 13,846,879,661 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (74.92%) - 1.390361830 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.920439 sec + 5,577,220,412 cycles # 2.899 GHz + 13,686,699,383 instructions # 2.45 insn per cycle + 1.933532640 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.708010 sec + 4,898,677,790 cycles # 2.863 GHz + 12,341,670,637 instructions # 2.52 insn per cycle + 1.722166284 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.187284 sec + 4,109,191,778 cycles # 1.875 GHz + 6,335,550,253 instructions # 1.54 insn per cycle + 2.200752564 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index fc9bbc7387..8d1778e673 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:54:42 +DATE: 2024-03-01_03:16:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.380234e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.523976e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.523976e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.660049 sec - 1,987,141,382 cycles:u # 2.923 GHz (74.61%) - 2,974,543 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.40%) - 34,864,776 stalled-cycles-backend:u # 1.75% backend cycles idle (74.77%) - 2,225,515,652 instructions:u # 1.12 insn per cycle - # 0.02 stalled cycles per insn (74.91%) - 0.705064086 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.511155 sec + 2,228,194,908 cycles # 3.016 GHz + 3,541,287,827 instructions # 1.59 insn per cycle + 0.799045956 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.209603e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.244411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.244411e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.550769 sec - 29,278,144,313 cycles:u # 3.408 GHz (75.04%) - 22,139,444 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.05%) - 1,142,582,420 stalled-cycles-backend:u # 3.90% backend cycles idle (75.04%) - 23,517,292,837 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.05%) - 8.613741102 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.305480 sec + 10,998,775,521 cycles # 3.077 GHz + 24,493,841,360 instructions # 2.23 insn per cycle + 3.633710964 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.203000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.203872e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.203872e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.455869 sec - 26,168,247,565 cycles:u # 3.499 GHz (74.97%) - 34,946,009 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.97%) - 3,763,223,846 stalled-cycles-backend:u # 14.38% backend cycles idle (74.97%) - 81,775,845,502 instructions:u # 3.13 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 7.481363596 seconds time elapsed -=Symbols in 
CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.395628 sec + 25,661,453,890 cycles # 3.059 GHz + 78,946,626,848 instructions # 3.08 insn per cycle + 8.400144517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.060165e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.064745e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.064745e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.252571 sec - 11,437,798,335 cycles:u # 3.491 GHz (74.87%) - 1,096,110 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 1,362,677,177 stalled-cycles-backend:u # 11.91% backend cycles idle (75.08%) - 39,244,727,057 instructions:u # 3.43 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 3.279634839 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352704 sec + 12,939,532,043 cycles # 2.970 GHz + 39,292,271,047 instructions # 3.04 insn per cycle + 4.357352756 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.209381e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.212036e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.212036e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.367520 sec - 4,809,725,861 cycles:u # 3.458 GHz (74.73%) - 786,688 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.80%) - 534,931,736 stalled-cycles-backend:u # 11.12% backend cycles idle (75.06%) - 13,801,628,982 instructions:u # 2.87 insn per cycle - # 0.04 stalled cycles per insn (75.27%) - 1.394470657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.929060 sec + 5,589,750,479 cycles # 2.892 GHz + 13,696,577,373 instructions # 2.45 insn per cycle + 1.933630865 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.695619 sec + 4,910,055,408 cycles # 2.889 GHz + 12,351,492,799 instructions # 2.52 insn per cycle + 1.700097015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.165843 sec + 4,123,850,554 cycles # 1.901 GHz + 6,345,407,560 instructions # 1.54 insn per cycle + 2.170297070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index c215267a23..597fd5665a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,182 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_15:04:56 +DATE: 2024-03-01_03:28:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.385233e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.605323e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.606526e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.649809 sec - 1,951,525,535 cycles:u # 2.906 GHz (74.92%) - 2,497,767 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.10%) - 34,093,522 stalled-cycles-backend:u # 1.75% backend cycles idle (75.36%) - 2,162,495,237 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (75.19%) - 
0.693235115 seconds time elapsed +TOTAL : 0.505991 sec + 2,242,092,583 cycles # 3.014 GHz + 3,466,791,908 instructions # 1.55 insn per cycle + 0.811853126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.246594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.246652e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.390908 sec - 28,908,247,505 cycles:u # 3.432 GHz (74.92%) - 11,922,008 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.98%) - 1,149,312,271 stalled-cycles-backend:u # 3.98% backend cycles idle (75.03%) - 
22,694,060,209 instructions:u # 0.79 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 8.446085700 seconds time elapsed +TOTAL : 3.124130 sec + 10,356,034,147 cycles # 3.069 GHz + 23,417,816,833 instructions # 2.26 insn per cycle + 3.433693053 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.203553e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204441e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.450336 sec - 26,160,134,176 cycles:u # 3.501 GHz (74.95%) - 35,129,020 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.92%) - 3,765,184,530 stalled-cycles-backend:u # 14.39% backend cycles idle (74.97%) - 81,743,104,429 instructions:u # 3.12 insn per cycle - # 0.05 stalled cycles per insn (75.06%) - 7.474754501 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.389537 sec + 25,646,805,438 cycles # 3.056 GHz + 78,935,262,340 instructions # 3.08 insn per cycle + 8.393631651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.036858e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.041413e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.041413e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.263765 sec - 11,479,314,951 cycles:u # 3.493 GHz (74.93%) - 1,078,236 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) - 1,374,719,385 stalled-cycles-backend:u # 11.98% backend cycles idle (74.93%) - 39,301,165,601 instructions:u # 3.42 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 3.288037777 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.369422 sec + 12,916,153,129 cycles # 2.954 GHz + 39,278,867,860 instructions # 3.04 insn per cycle + 4.373667823 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.210717e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213341e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213341e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.362358 sec - 4,821,236,803 cycles:u # 3.482 GHz (74.69%) - 1,484,452 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.16%) - 494,259,401 stalled-cycles-backend:u # 10.25% backend cycles idle (75.16%) - 13,804,004,919 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 1.386492812 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +TOTAL : 1.933878 sec + 5,580,678,683 cycles # 2.881 GHz + 13,684,529,284 instructions # 2.45 insn per cycle + 1.937965494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.697628 sec + 4,903,453,092 cycles # 2.882 GHz + 12,338,806,795 instructions # 2.52 insn per cycle + 1.701856837 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.253262 sec + 4,111,107,725 cycles # 1.822 GHz + 6,332,329,650 instructions # 1.54 insn per cycle + 2.257544828 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index fd095032ba..de32359ede 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,187 +1,227 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_15:01:12 +DATE: 2024-03-01_03:21:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.448083e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.580114e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.581811e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.648683 sec - 1,955,873,857 cycles:u # 2.904 GHz (75.51%) - 3,004,987 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.06%) - 33,884,000 stalled-cycles-backend:u # 1.73% backend cycles idle (75.16%) - 2,161,496,629 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (75.54%) - 0.692984705 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.508517 sec + 2,246,531,629 cycles # 3.011 GHz + 3,559,465,442 instructions # 1.58 insn per cycle + 0.806328345 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.209371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241273e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241330e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.510530 sec - 29,321,272,330 cycles:u # 3.430 GHz (75.00%) - 22,499,031 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.01%) - 1,140,594,084 stalled-cycles-backend:u # 3.89% backend cycles idle (75.03%) - 23,505,210,458 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 8.565646128 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.195111 sec + 10,565,694,760 cycles # 3.061 GHz + 24,272,327,456 instructions # 2.30 insn per cycle + 3.508790742 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.196828e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197708e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197708e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.473100 sec - 26,252,974,565 cycles:u # 3.503 GHz (74.96%) - 42,014,245 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.00%) - 3,761,392,018 stalled-cycles-backend:u # 14.33% backend cycles idle (75.03%) - 81,736,635,956 instructions:u # 3.11 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 7.497564777 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.415718 sec + 25,630,796,247 cycles # 3.044 GHz + 78,935,144,677 instructions # 3.08 insn per cycle + 8.419920398 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.059285e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.063808e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.063808e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.249304 sec - 11,441,253,986 cycles:u # 3.497 GHz (74.91%) - 1,103,861 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 1,360,429,247 stalled-cycles-backend:u # 11.89% backend cycles idle (75.06%) - 39,250,559,070 instructions:u # 3.43 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 3.273415731 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.383944 sec + 12,941,364,841 cycles # 2.950 GHz + 39,279,009,350 instructions # 3.04 insn per cycle + 4.388336169 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.211457e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.214083e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214083e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.361522 sec - 4,818,596,084 cycles:u # 3.482 GHz (74.77%) - 724,451 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.01%) - 509,043,737 stalled-cycles-backend:u # 10.56% backend cycles idle (75.15%) - 13,808,567,781 instructions:u # 2.87 insn per cycle - # 0.04 stalled cycles per insn (75.15%) - 1.385716850 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.951803 sec + 5,576,482,664 cycles # 2.852 GHz + 13,685,505,947 instructions # 2.45 insn per cycle + 1.956019187 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.690955 sec + 4,892,330,509 cycles # 2.888 GHz + 12,340,572,549 instructions # 2.52 insn per cycle + 1.695111197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.155567 sec + 4,105,793,778 cycles # 1.902 GHz + 6,333,858,387 instructions # 1.54 insn per cycle + 2.159935327 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index b4e3b36de1..836b2fd223 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:09:31 +DATE: 2024-03-01_02:33:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.386128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.441368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.441866e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.520173 sec - 1,518,418,478 cycles:u # 2.822 GHz (74.19%) - 2,284,692 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.45%) - 5,557,571 stalled-cycles-backend:u # 0.37% backend cycles idle (74.52%) - 1,877,465,075 instructions:u # 1.24 insn per cycle - # 0.00 stalled cycles per insn (74.75%) - 0.576564042 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.492178e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.523446 sec + 2,259,779,898 cycles # 2.994 GHz + 3,514,783,609 instructions # 1.56 insn per cycle + 0.830655921 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.739084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.744120e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.744244e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.344064 sec - 21,795,921,727 cycles:u # 3.421 GHz (74.89%) - 3,060,139 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) - 5,868,247 stalled-cycles-backend:u # 0.03% backend cycles idle (75.02%) - 17,480,500,624 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 6.398703650 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.027147 sec + 10,102,095,677 cycles # 3.066 GHz + 22,774,733,235 instructions # 2.25 insn per cycle + 3.352533111 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.227750e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.228660e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.228660e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.369278 sec - 25,879,528,720 cycles:u # 3.501 GHz (75.00%) - 6,710,952 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) - 3,434,328,529 stalled-cycles-backend:u # 13.27% backend cycles idle (75.00%) - 81,766,955,675 instructions:u # 3.16 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.393999483 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.342362 sec + 25,562,894,530 cycles # 3.064 GHz + 78,707,498,900 instructions # 3.08 insn per cycle + 8.350709191 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.031721e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.036221e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.036221e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.266876 sec - 11,494,978,612 cycles:u # 3.494 GHz (74.95%) - 1,901,440 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 1,517,805,835 stalled-cycles-backend:u # 13.20% backend cycles idle (74.95%) - 39,254,997,853 instructions:u # 3.41 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 3.293275469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.374701 sec + 12,919,245,066 cycles # 2.951 GHz + 39,226,355,054 instructions # 3.04 insn per cycle + 4.387657418 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.207772e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210381e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.210381e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.365420 sec - 4,828,463,830 cycles:u # 3.478 GHz (74.71%) - 750,092 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.69%) - 598,745,360 stalled-cycles-backend:u # 12.40% backend cycles idle (74.93%) - 13,824,866,165 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.18%) - 1.391810341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.987975 sec + 5,629,143,308 cycles # 2.825 GHz + 13,800,788,871 instructions # 2.45 insn per cycle + 1.999251955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.716692 sec + 4,942,228,477 cycles # 2.873 GHz + 12,466,581,724 instructions # 2.52 insn per cycle + 1.728222884 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.159145 sec + 4,117,977,410 cycles # 1.904 GHz + 6,458,802,297 instructions # 1.57 insn per cycle + 2.172057894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157320E-004 +Relative difference = 2.837296634927675e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 305b2aa7ff..5cb26f1dc5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:35:23 +DATE: 2024-03-01_03:05:40 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.406410e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571548e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.572717e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.638815 sec - 1,900,407,247 cycles:u # 2.901 GHz (75.09%) - 2,254,354 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.58%) - 5,396,830 stalled-cycles-backend:u # 0.28% backend cycles idle (75.60%) - 2,102,115,004 instructions:u # 1.11 insn per cycle - # 0.00 stalled cycles per insn (75.59%) - 0.685261289 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.264818e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.529504 sec + 2,311,611,520 cycles # 3.006 GHz + 3,548,053,349 instructions # 1.53 insn per cycle + 0.826491750 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245902e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245963e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.699819 sec - 26,542,369,018 cycles:u # 3.434 GHz (74.95%) - 3,288,397 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 4,842,522 stalled-cycles-backend:u # 0.02% backend cycles idle (75.06%) - 21,115,129,397 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 7.757988929 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.298192 sec + 10,832,117,508 cycles # 3.051 GHz + 23,123,371,744 instructions # 2.13 insn per cycle + 3.609870208 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.571026e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.571408e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.571408e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.886395 sec - 125,842,685,634 cycles:u # 3.505 GHz (74.99%) - 13,532,780 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 15,368,715,731 stalled-cycles-backend:u # 12.21% backend cycles idle (75.00%) - 141,501,900,955 instructions:u # 1.12 insn per cycle - # 0.11 stalled cycles per insn (75.00%) - 35.911647079 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 36.966049 sec + 113,615,073,618 cycles # 3.074 GHz + 144,968,095,911 instructions # 1.28 insn per cycle + 36.970400514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.508609e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.510771e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.510771e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.682382 sec - 16,441,765,172 cycles:u # 3.495 GHz (75.01%) - 11,758,570 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) - 6,326,199,000 stalled-cycles-backend:u # 38.48% backend cycles idle (75.01%) - 37,548,136,278 instructions:u # 2.28 insn per cycle - # 0.17 stalled cycles per insn (75.01%) - 4.708772467 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.007790 sec + 14,730,075,423 cycles # 2.939 GHz + 37,574,123,368 instructions # 2.55 insn per cycle + 5.012256986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.553279e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.563561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.563561e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.179424 sec - 7,678,063,051 cycles:u # 3.486 GHz (74.95%) - 742,936 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 4,343,621,516 stalled-cycles-backend:u # 56.57% backend cycles idle (74.94%) - 12,960,132,017 instructions:u # 1.69 insn per cycle - # 0.34 stalled cycles per insn (74.94%) - 2.206216616 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.127650 sec + 6,163,100,705 cycles # 2.892 GHz + 13,061,449,928 instructions # 2.12 insn per cycle + 2.132187716 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.743142 sec + 5,059,957,423 cycles # 2.897 GHz + 11,440,000,239 instructions # 2.26 insn per cycle + 1.747501406 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.075865 sec + 3,979,244,183 cycles # 1.914 GHz + 5,942,139,795 instructions # 1.49 insn per cycle + 2.080305520 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 3f2290590f..afca4b7953 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:36:33 +DATE: 2024-03-01_03:06:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.381730e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.434033e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.434455e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.520350 sec - 1,520,729,660 cycles:u # 2.822 GHz (73.14%) - 2,385,310 stalled-cycles-frontend:u # 0.16% frontend cycles idle (73.56%) - 5,252,011 stalled-cycles-backend:u # 0.35% backend cycles idle (74.78%) - 1,856,224,661 instructions:u # 1.22 insn per cycle - # 0.00 stalled cycles per insn (74.81%) - 0.565229996 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.275983e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.531287 sec + 2,311,991,159 cycles # 3.015 GHz + 3,584,221,599 instructions # 1.55 insn per cycle + 0.825938734 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738254e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.743407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.743529e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.346686 sec - 21,768,032,455 cycles:u # 3.414 GHz (74.98%) - 2,929,134 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 4,917,671 stalled-cycles-backend:u # 0.02% backend cycles idle (75.08%) - 17,435,737,061 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 6.404044795 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.269849 sec + 10,805,743,512 cycles # 3.068 GHz + 25,084,175,459 instructions # 2.32 insn per cycle + 3.579404730 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.526933e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.527306e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.527306e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 36.235634 sec - 127,055,211,551 cycles:u # 3.504 GHz (75.00%) - 16,943,775 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 15,460,991,830 stalled-cycles-backend:u # 12.17% backend cycles idle (75.00%) - 141,672,992,120 instructions:u # 1.12 insn per cycle - # 0.11 stalled cycles per insn (75.00%) - 36.260772993 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 37.253529 sec + 114,121,742,420 cycles # 3.069 GHz + 145,689,073,244 instructions # 1.28 insn per cycle + 37.257693750 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140461E-004 -Relative difference = 2.8372991790910424e-07 +Avg ME (F77/C++) = 6.6266731198140450E-004 +Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.565168e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.567504e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.567504e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.608055 sec - 16,189,817,583 cycles:u # 3.496 GHz (74.95%) - 4,727,401 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.95%) - 6,417,413,717 stalled-cycles-backend:u # 39.64% backend cycles idle (74.96%) - 37,648,163,335 instructions:u # 2.33 insn per cycle - # 0.17 stalled cycles per insn (74.98%) - 4.634664036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.136766 sec + 15,152,451,249 cycles # 2.948 GHz + 37,761,291,325 instructions # 2.49 insn per cycle + 5.141156615 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141220E-004 -Relative difference = 2.837299064562788e-07 +Avg ME (F77/C++) = 6.6266731198141209E-004 +Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.770607e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.781417e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.781417e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.118504 sec - 7,454,241,266 cycles:u # 3.481 GHz (74.97%) - 787,228 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 4,288,175,252 stalled-cycles-backend:u # 57.53% backend cycles idle (74.97%) - 12,851,745,046 instructions:u # 1.72 insn per cycle - # 0.33 stalled cycles per insn (74.97%) - 2.144706465 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072422 sec + 6,013,210,013 cycles # 2.896 GHz + 12,895,807,400 instructions # 2.14 insn per cycle + 2.076740513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.755119 sec + 5,091,337,522 cycles # 2.895 GHz + 11,446,622,503 instructions # 2.25 insn per cycle + 1.759562583 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.059473 sec + 3,944,538,203 cycles # 1.912 GHz + 5,896,184,476 instructions # 1.49 insn per cycle + 2.063940696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 667b1207c3..082176c355 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:10:05 +DATE: 2024-03-01_02:33:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.559555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.765298e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.766948e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.426954 sec - 1,155,934,779 cycles:u # 2.607 GHz (75.85%) - 2,270,453 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.69%) - 5,599,816 stalled-cycles-backend:u # 0.48% backend cycles idle (75.28%) - 1,588,765,906 instructions:u # 1.37 insn per cycle - # 0.00 stalled cycles per insn (74.63%) - 0.473207237 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.401451e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.481440 sec + 2,077,514,231 cycles # 2.979 GHz + 3,093,505,744 instructions # 1.49 insn per cycle + 0.777796663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.702651e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.730089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.730609e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.615281 sec - 8,762,195,591 cycles:u # 3.319 GHz (74.91%) - 2,598,778 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.13%) - 5,515,649 stalled-cycles-backend:u # 0.06% backend cycles idle (75.03%) - 7,373,798,662 instructions:u # 0.84 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 2.666169145 seconds time elapsed +EvtsPerSec[Rmb+ME] 
(23) = ( 8.622317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.713365 sec + 5,944,272,538 cycles # 3.053 GHz + 12,632,277,461 instructions # 2.13 insn per cycle + 2.004079656 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.471402e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.472463e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.472463e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.642044 sec - 23,349,584,089 cycles:u # 3.504 GHz (74.93%) - 1,322,457 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,539,618,789 stalled-cycles-backend:u # 10.88% backend cycles idle (75.01%) - 75,891,173,645 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 6.666703631 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.010109 sec + 24,614,432,061 cycles # 3.072 GHz + 78,126,558,251 instructions # 3.17 insn per cycle + 8.016891762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.951991e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.970075e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.970075e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.654004 sec - 5,841,823,067 cycles:u # 3.485 GHz (74.75%) - 702,558 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 854,737,495 stalled-cycles-backend:u # 14.63% backend cycles idle (75.19%) - 20,134,073,878 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.19%) - 1.680033246 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.228676 sec + 6,461,822,382 cycles # 2.894 GHz + 20,120,855,558 instructions # 3.11 insn per cycle + 2.241648353 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.387538e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.397939e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.397939e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.692996 sec - 2,470,623,307 cycles:u # 3.453 GHz (74.55%) - 588,565 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.07%) - 257,432,909 stalled-cycles-backend:u # 10.42% backend cycles idle (75.42%) - 7,058,314,336 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.42%) - 0.718886614 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.990019 sec + 2,821,251,649 cycles # 2.839 GHz + 6,989,221,748 instructions # 2.48 insn per cycle + 1.002444816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.861179 sec + 2,488,986,957 cycles # 2.876 GHz + 6,296,476,670 instructions # 2.53 insn per cycle + 0.887481911 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.078476 sec + 2,048,809,794 cycles # 1.894 GHz + 3,266,667,713 instructions # 1.59 insn per cycle + 1.091634951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index c41e2519f4..6f564b583c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:55:18 +DATE: 2024-03-01_03:17:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.582299e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.759016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.759016e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.444663 sec - 1,257,995,614 cycles:u # 2.712 GHz (74.36%) - 3,323,803 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.32%) - 33,750,386 stalled-cycles-backend:u # 2.68% backend cycles idle (74.26%) - 1,681,094,930 instructions:u # 1.34 insn per cycle - # 0.02 stalled cycles per insn (74.22%) - 0.489024988 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468201 sec + 2,060,292,715 cycles # 2.983 GHz + 3,094,906,819 instructions # 1.50 insn per cycle + 0.750075013 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.268559e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.711027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.711027e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.448567 sec - 11,570,440,658 cycles:u # 3.326 GHz (74.91%) - 38,666,267 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.90%) - 1,139,194,577 stalled-cycles-backend:u # 9.85% backend cycles idle (74.95%) - 9,919,737,336 instructions:u # 0.86 insn per cycle - # 0.11 stalled cycles per insn (75.01%) - 3.502089428 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.882218 sec + 6,478,461,444 cycles # 3.059 GHz + 12,879,929,349 instructions # 1.99 insn per cycle + 2.174649918 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.475508e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.476558e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.476558e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.632831 sec - 23,294,407,490 cycles:u # 3.500 GHz (75.00%) - 2,385,816 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 2,694,046,041 stalled-cycles-backend:u # 11.57% backend cycles idle (75.00%) - 75,886,020,623 instructions:u # 3.26 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 6.657837903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.044775 sec + 24,623,818,516 cycles # 3.060 GHz + 78,132,484,739 instructions # 3.17 insn per cycle + 8.049291657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.920792e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.938871e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.938871e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.661196 sec - 5,849,624,843 cycles:u # 3.474 GHz (74.82%) - 744,753 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.82%) - 890,080,654 stalled-cycles-backend:u # 15.22% backend cycles idle (74.87%) - 20,186,191,059 instructions:u # 3.45 insn per cycle - # 0.04 stalled cycles per insn (75.11%) - 1.687832899 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.197009 sec + 6,464,288,620 cycles # 2.938 GHz + 20,129,426,624 instructions # 3.11 insn per cycle + 2.201352169 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.381518e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.391943e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.391943e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.696738 sec - 2,468,710,150 cycles:u # 3.431 GHz (74.46%) - 573,862 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.64%) - 272,091,914 stalled-cycles-backend:u # 11.02% backend cycles idle (75.18%) - 7,069,772,671 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.56%) - 0.722935447 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.973161 sec + 2,827,392,405 cycles # 2.894 GHz + 6,998,075,079 instructions # 2.48 insn per cycle + 0.977561277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859317 sec + 2,491,742,914 cycles # 2.887 GHz + 6,305,390,293 instructions # 2.53 insn per cycle + 0.863665296 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.067932 sec + 2,057,227,059 cycles # 1.920 GHz + 3,276,345,738 instructions # 1.59 insn per cycle + 1.072312021 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 1e1eb5616b..66226e8d59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,182 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_15:05:33 +DATE: 2024-03-01_03:28:45 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.496857e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.768561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.770072e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.441201 sec - 1,236,286,976 cycles:u # 2.684 GHz (74.02%) - 2,734,282 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.50%) - 38,777,979 stalled-cycles-backend:u # 3.14% backend cycles idle (75.81%) - 1,561,318,017 instructions:u # 1.26 insn per cycle - # 0.02 stalled cycles per insn (75.69%) - 0.484851074 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 6.363626e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.461299 sec + 2,006,885,691 cycles # 2.992 GHz + 3,022,532,155 instructions # 1.51 insn per cycle + 0.728549346 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.687980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.723773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.724201e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.310015 sec - 11,092,246,100 cycles:u # 3.325 GHz (75.06%) - 27,936,907 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.11%) - 1,142,037,661 stalled-cycles-backend:u # 10.30% backend cycles idle (75.12%) - 8,995,510,132 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.11%) - 3.359963375 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.795584 sec + 6,148,728,410 cycles # 3.042 GHz + 12,326,233,623 instructions # 2.00 insn per cycle + 2.078967785 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 
2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.474042e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.475089e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.475089e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.634873 sec - 23,324,491,000 cycles:u # 3.504 GHz (74.99%) - 1,298,892 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 2,500,617,481 stalled-cycles-backend:u # 10.72% backend cycles idle (75.01%) - 75,884,664,773 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 6.659063429 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 
3.253446e-01 ) GeV^-4 +TOTAL : 7.994149 sec + 24,620,138,866 cycles # 3.079 GHz + 78,125,377,108 instructions # 3.17 insn per cycle + 7.998228624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.912450e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.930442e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.930442e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.660516 sec - 5,860,492,825 cycles:u # 3.483 GHz (74.80%) - 714,588 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) - 892,887,399 stalled-cycles-backend:u # 15.24% backend cycles idle (74.84%) - 20,178,833,108 instructions:u # 3.44 insn per cycle - # 0.04 stalled cycles per insn (75.09%) - 1.684440329 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 2.242069 sec + 6,461,640,731 cycles # 2.878 GHz + 20,121,052,869 instructions # 3.11 insn per cycle + 2.246196034 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.387190e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.397750e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.397750e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.693065 sec - 2,473,715,053 cycles:u # 3.459 GHz (74.36%) - 511,829 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.70%) - 267,325,215 stalled-cycles-backend:u # 10.81% backend cycles idle (75.18%) - 7,067,457,267 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.41%) - 0.716919116 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.982986 sec + 2,822,415,829 cycles # 2.862 GHz + 6,987,486,660 instructions # 2.48 insn per cycle + 0.987025186 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.855808 sec + 2,484,894,865 cycles # 2.892 GHz + 6,291,816,709 instructions # 2.53 insn per cycle + 0.859867773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 1.069890 sec + 2,051,026,977 cycles # 1.912 GHz + 3,263,937,559 instructions # 1.59 insn per cycle + 1.073863100 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 5fc39c9ab7..29def3747b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,187 +1,227 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_15:01:48 +DATE: 2024-03-01_03:22:13 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.593091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.763833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.764954e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.438026 sec - 1,247,714,977 cycles:u # 2.700 GHz (74.14%) - 3,183,765 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.84%) - 40,804,785 stalled-cycles-backend:u # 3.27% backend cycles idle (75.76%) - 1,566,002,893 instructions:u # 1.26 insn per cycle - # 0.03 stalled cycles per insn (75.65%) - 0.481169073 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.463988 sec + 2,009,660,419 cycles # 2.987 GHz + 3,043,780,102 instructions # 1.51 insn per cycle + 0.732052318 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.300030e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.726516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.726958e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.412260 sec - 11,498,941,344 cycles:u # 3.341 GHz (75.04%) - 39,222,891 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.01%) - 1,145,420,210 stalled-cycles-backend:u # 9.96% backend cycles idle (74.90%) - 9,903,387,147 instructions:u # 0.86 insn per cycle - # 0.12 stalled cycles per insn (74.90%) - 3.462061630 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.829361 sec + 6,179,090,687 cycles # 3.005 GHz + 13,497,023,724 instructions # 2.18 insn per cycle + 2.119489112 seconds time elapsed ------------------------------------------------------------------------- -cmpExe 
/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.466591e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.467646e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.467646e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.654970 sec - 23,382,894,327 cycles:u # 3.502 GHz (74.96%) - 1,285,649 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,698,624,585 stalled-cycles-backend:u # 11.54% backend cycles idle (74.96%) - 75,918,146,842 instructions:u # 3.25 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 6.679227111 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.072340 sec + 24,646,233,583 cycles # 3.055 GHz + 78,130,465,005 instructions # 3.17 insn per cycle + 8.076398723 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863266294753E-004 +Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.948738e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.967099e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.967099e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.654520 sec - 5,849,988,183 cycles:u # 3.489 GHz (74.62%) - 733,925 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.86%) - 893,308,966 stalled-cycles-backend:u # 15.27% backend cycles idle (75.19%) - 20,137,311,738 instructions:u # 3.44 insn per cycle - # 0.04 stalled cycles per insn (75.20%) - 1.678437046 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.213064 sec + 6,463,144,308 cycles # 2.916 GHz + 20,121,040,605 instructions # 3.11 insn per cycle + 2.217197026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861460025036E-004 +Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.386144e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.396641e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.396641e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.693313 sec - 2,474,987,335 cycles:u # 3.461 GHz (74.36%) - 561,279 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) - 260,961,501 stalled-cycles-backend:u # 10.54% backend cycles idle (75.32%) - 7,061,622,493 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.41%) - 0.716991593 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.977816 sec + 2,816,932,981 cycles # 2.871 GHz + 6,987,870,279 instructions # 2.48 insn per cycle + 0.981891147 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859893 sec + 2,483,713,955 cycles # 2.877 GHz + 6,295,351,555 instructions # 2.53 insn per cycle + 0.863911879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271938174574524E-004 +Relative difference = 2.7544470208782633e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.064299 sec + 2,046,605,748 cycles # 1.917 GHz + 3,265,707,472 instructions # 1.60 insn per cycle + 1.068273671 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952779373838E-004 +Relative difference = 4.193891735414155e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 971e76956f..50b444080d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:10:32 +DATE: 2024-03-01_02:34:14 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.566678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.761241e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.762888e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.424948 sec - 1,151,762,649 cycles:u # 2.598 GHz (74.97%) - 2,181,608 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.88%) - 5,563,008 stalled-cycles-backend:u # 0.48% backend cycles idle (74.76%) - 1,556,455,425 instructions:u # 1.35 insn per cycle - # 0.00 stalled cycles per insn (74.71%) - 0.469666085 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.380502e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.482038 sec + 2,083,496,491 cycles # 2.987 GHz + 3,090,021,729 instructions # 1.48 insn per cycle + 0.780369869 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.731093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.758598e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.759042e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.607659 sec - 8,768,629,457 cycles:u # 3.332 GHz (74.72%) - 2,624,104 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.93%) - 5,576,888 stalled-cycles-backend:u # 0.06% backend cycles idle (74.95%) - 7,487,633,184 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 2.658904547 seconds time elapsed +EvtsPerSec[Rmb+ME] 
(23) = ( 8.505248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.719742 sec + 5,952,430,615 cycles # 3.047 GHz + 11,750,571,480 instructions # 1.97 insn per cycle + 2.009992190 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.471528e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.472574e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.472574e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.641416 sec - 23,353,702,510 cycles:u # 3.505 GHz (74.93%) - 1,362,469 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,617,885,041 stalled-cycles-backend:u # 11.21% backend cycles idle (75.03%) - 75,791,709,024 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 6.666120159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.050624 sec + 24,577,706,132 cycles # 3.054 GHz + 77,857,469,800 instructions # 3.17 insn per cycle + 8.057072902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866108667618E-004 -Relative difference = 5.871505118544242e-08 +Avg ME (F77/C++) = 6.6274866268634797E-004 +Relative difference = 5.630135835748959e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.955947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.973665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.973665e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.653154 sec - 5,851,964,104 cycles:u # 3.492 GHz (74.57%) - 716,574 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) - 873,414,641 stalled-cycles-backend:u # 14.93% backend cycles idle (75.18%) - 20,127,366,441 instructions:u # 3.44 insn per cycle - # 0.04 stalled cycles per insn (75.18%) - 1.679093232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.274363 sec + 6,415,212,085 cycles # 2.816 GHz + 20,086,390,532 instructions # 3.13 insn per cycle + 2.288238797 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274861465384638E-004 +Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.371919e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.382165e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.382165e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.697310 sec - 2,486,415,833 cycles:u # 3.455 GHz (74.08%) - 636,088 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.64%) - 320,151,616 stalled-cycles-backend:u # 12.88% backend cycles idle (75.56%) - 7,058,223,186 instructions:u # 2.84 insn per cycle - # 0.05 stalled cycles per insn (75.57%) - 0.723213530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.010969 sec + 2,918,129,602 cycles # 2.878 GHz + 7,130,827,098 instructions # 2.44 insn per cycle + 1.024648825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.895519 sec + 2,583,274,132 cycles # 2.873 GHz + 6,439,451,842 instructions # 2.49 insn per cycle + 0.910176239 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627194e-04 +Avg ME (F77/C++) = 6.6271939668077068E-004 +Relative difference = 5.008498817890231e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.109477 sec + 2,120,739,457 cycles # 1.905 GHz + 3,428,489,642 instructions # 1.62 insn per cycle + 1.120804955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952032322112E-004 +Relative difference = 3.066639970473621e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 9b9d6daf6d..3e610d68fd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:37:41 +DATE: 2024-03-01_03:07:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.588992e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.775838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.777533e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.424815 sec - 1,147,914,206 cycles:u # 2.596 GHz (75.50%) - 2,137,582 stalled-cycles-frontend:u # 0.19% frontend cycles idle (76.49%) - 4,985,293 stalled-cycles-backend:u # 0.43% backend cycles idle (76.40%) - 1,569,527,290 instructions:u # 1.37 insn per cycle - # 0.00 stalled cycles per insn (75.34%) - 0.469563386 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.599390e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.487762 sec + 2,117,397,644 cycles # 2.979 GHz + 3,170,491,357 instructions # 1.50 insn per cycle + 0.771619877 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.702922e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.730822e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731253e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.628763 sec - 8,748,097,863 cycles:u # 3.316 GHz (74.96%) - 2,437,447 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.13%) - 4,749,082 stalled-cycles-backend:u # 0.05% backend cycles idle (75.19%) - 7,394,823,172 instructions:u # 0.85 insn per cycle - # 0.00 stalled cycles per insn (75.12%) - 2.677900313 seconds time elapsed +EvtsPerSec[Rmb+ME] 
(23) = ( 7.728616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.852993 sec + 6,403,206,858 cycles # 3.066 GHz + 13,984,822,985 instructions # 2.18 insn per cycle + 2.145838793 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.839569e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.840163e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.840163e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 28.090715 sec - 98,526,254,999 cycles:u # 3.505 GHz (74.98%) - 422,342,769 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.00%) - 5,651,142,068 stalled-cycles-backend:u # 5.74% backend cycles idle (75.01%) - 134,061,336,373 instructions:u # 1.36 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 28.115646344 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.541681 sec + 87,683,123,741 cycles # 3.072 GHz + 135,626,627,328 instructions # 1.55 insn per cycle + 28.545959109 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340697351248E-004 -Relative difference = 1.052203199451665e-08 +Avg ME (F77/C++) = 6.6275340277317796E-004 +Relative difference = 4.184328521943034e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.281572e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.294180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.294180e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.986665 sec - 6,998,627,436 cycles:u # 3.483 GHz (74.92%) - 4,972,896 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.92%) - 3,166,433,191 stalled-cycles-backend:u # 45.24% backend cycles idle (74.92%) - 19,197,411,512 instructions:u # 2.74 insn per cycle - # 0.16 stalled cycles per insn (74.95%) - 2.012844995 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.302428 sec + 6,776,067,855 cycles # 2.939 GHz + 19,386,467,667 instructions # 2.86 insn per cycle + 2.306810458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857053714997E-004 -Relative difference = 4.445554471174176e-08 +Avg ME (F77/C++) = 6.6274862707273868E-004 +Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.474226e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.478244e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.478244e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.118881 sec - 3,963,587,799 cycles:u # 3.473 GHz (74.82%) - 618,981 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.79%) - 2,252,906,025 stalled-cycles-backend:u # 56.84% backend cycles idle (74.79%) - 6,771,275,605 instructions:u # 1.71 insn per cycle - # 0.33 stalled cycles per insn (74.83%) - 1.145186082 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.096393 sec + 3,175,310,502 cycles # 2.890 GHz + 6,807,675,147 instructions # 2.14 insn per cycle + 1.100557110 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735722101156E-004 -Relative difference = 6.454990161554483e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.911313 sec + 2,641,911,907 cycles # 2.888 GHz + 5,985,989,672 instructions # 2.27 insn per cycle + 0.915610697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731558747466E-004 +Relative difference = 2.3520194007978538e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.084772 sec + 2,074,111,548 cycles # 1.906 GHz + 3,500,542,355 instructions # 1.69 insn per cycle + 1.089027435 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750363879224E-004 +Relative difference = 5.490631193034436e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 3abe53f650..f668536073 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:38:32 +DATE: 2024-03-01_03:08:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.550920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.752610e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.754230e+05 ) sec^-1 -MeanMatrixElemValue = ( 7.535870e-02 +- 4.279978e-02 ) GeV^-4 -TOTAL : 0.432609 sec - 1,170,709,207 cycles:u # 2.632 GHz (75.51%) - 2,245,635 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.63%) - 5,034,999 stalled-cycles-backend:u # 0.43% backend cycles idle (75.16%) - 1,568,756,660 instructions:u # 1.34 insn per cycle - # 0.00 stalled cycles per insn (74.90%) - 0.477253690 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
5.593399e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.485011 sec + 2,123,544,393 cycles # 3.007 GHz + 3,219,525,664 instructions # 1.52 insn per cycle + 0.766064420 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.719669e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.747428e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.747865e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.763201e+01 +- 6.205778e+01 ) GeV^-4 -TOTAL : 2.614994 sec - 8,774,192,257 cycles:u # 3.326 GHz (74.88%) - 2,623,817 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) - 5,376,553 stalled-cycles-backend:u # 0.06% backend cycles idle (75.13%) - 7,407,357,792 instructions:u # 0.84 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 2.665747459 seconds time elapsed +EvtsPerSec[Rmb+ME] 
(23) = ( 7.637487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.858325 sec + 6,401,876,626 cycles # 3.056 GHz + 13,834,352,039 instructions # 2.16 insn per cycle + 2.151127842 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 5.805792e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.806376e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.806376e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 28.253870 sec - 99,085,121,864 cycles:u # 3.504 GHz (74.99%) - 403,753,080 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.99%) - 5,989,522,868 stalled-cycles-backend:u # 6.04% backend cycles idle (74.99%) - 133,996,319,049 instructions:u # 1.35 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 28.278762237 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.469746 sec + 87,566,965,728 cycles # 3.076 GHz + 135,909,521,186 instructions # 1.55 insn per cycle + 28.473960910 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275346486299042E-004 -Relative difference = 5.301670926116898e-08 +Avg ME (F77/C++) = 6.6275352674967369E-004 +Relative difference = 4.0361421941458736e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.151371e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.163159e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.163159e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.018070 sec - 7,109,141,926 cycles:u # 3.483 GHz (74.92%) - 6,794,771 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.92%) - 2,870,898,971 stalled-cycles-backend:u # 40.38% backend cycles idle (74.92%) - 19,267,656,430 instructions:u # 2.71 insn per cycle - # 0.15 stalled cycles per insn (74.95%) - 2.044423316 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.304055 sec + 6,854,008,563 cycles # 2.972 GHz + 19,438,508,034 instructions # 2.84 insn per cycle + 2.308246423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857044990032E-004 -Relative difference = 4.4587192899226015e-08 +Avg ME (F77/C++) = 6.6274862764021530E-004 +Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.507945e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512087e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.093716 sec - 3,878,852,536 cycles:u # 3.474 GHz (74.94%) - 558,959 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%) - 2,192,907,076 stalled-cycles-backend:u # 56.53% backend cycles idle (74.92%) - 6,707,121,317 instructions:u # 1.73 insn per cycle - # 0.33 stalled cycles per insn (74.93%) - 1.119771463 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.070827 sec + 3,111,432,280 cycles # 2.896 GHz + 6,718,585,544 instructions # 2.16 insn per cycle + 1.075017514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735755491807E-004 -Relative difference = 6.404606472340801e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.900474 sec + 2,630,752,588 cycles # 2.910 GHz + 5,969,340,561 instructions # 2.27 insn per cycle + 0.904647261 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627273e-04 +Avg ME (F77/C++) = 6.6272731651051409E-004 +Relative difference = 2.4912983202981302e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.083027 sec + 2,083,719,160 cycles # 1.918 GHz + 3,494,111,175 instructions # 1.68 insn per cycle + 1.087325959 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272750384530066E-004 +Relative difference = 5.80223501432476e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 2ab49e3bd6..8553820a52 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:10:58 +DATE: 2024-03-01_02:34:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.425592e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.587766e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.588951e+04 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.638584 sec - 1,919,574,096 cycles:u # 2.923 GHz (74.70%) - 2,421,805 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.49%) - 5,597,340 stalled-cycles-backend:u # 0.29% backend cycles idle (74.97%) - 2,103,364,228 instructions:u # 1.10 insn per cycle - # 0.00 stalled cycles per insn (75.54%) - 0.684865412 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.504525e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522907 sec + 2,248,416,129 cycles # 2.981 GHz + 3,483,881,112 instructions # 1.55 insn per cycle + 0.829467781 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.244551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.247201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.247258e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 7.688882 sec - 26,503,223,516 cycles:u # 3.435 GHz (75.02%) - 3,266,324 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 4,510,741 stalled-cycles-backend:u # 0.02% backend cycles idle (74.96%) - 21,089,006,992 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 7.745515345 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.035491 sec + 10,039,386,860 cycles # 3.052 GHz + 22,522,898,713 instructions # 2.24 insn per cycle + 3.349083086 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.210580e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.211471e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.211471e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.426674 sec - 26,082,399,852 cycles:u # 3.501 GHz (74.98%) - 8,305,754 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.98%) - 3,453,424,669 stalled-cycles-backend:u # 13.24% backend cycles idle (74.98%) - 82,445,002,864 instructions:u # 3.16 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 7.451680513 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.409354 sec + 25,927,870,734 cycles # 3.082 GHz + 79,436,480,305 instructions # 3.06 insn per cycle + 8.416137774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.104934e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.109669e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.109669e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.220409 sec - 11,345,813,930 cycles:u # 3.499 GHz (74.79%) - 3,530,047 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) - 1,380,647,933 stalled-cycles-backend:u # 12.17% backend cycles idle (75.09%) - 38,530,758,808 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (75.09%) - 3.246786684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.395641 sec + 12,641,926,900 cycles # 2.873 GHz + 38,549,360,435 instructions # 3.05 insn per cycle + 4.411574958 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.216330e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218977e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218977e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.356050 sec - 4,800,488,094 cycles:u # 3.482 GHz (74.92%) - 704,363 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) - 500,136,585 stalled-cycles-backend:u # 10.42% backend cycles idle (75.05%) - 13,595,981,507 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 1.382171821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.889905 sec + 5,503,418,397 cycles # 2.905 GHz + 13,481,227,468 instructions # 2.45 insn per cycle + 1.901949052 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.679659 sec + 4,858,057,374 cycles # 2.885 GHz + 12,135,455,571 instructions # 2.50 insn per cycle + 1.694768152 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.297248 sec + 4,143,595,621 cycles # 1.801 GHz + 6,336,694,490 instructions # 1.53 insn per cycle + 2.312628428 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index f26a504a43..44d560fb63 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-03_14:11:34 +DATE: 2024-03-01_02:35:21 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.391332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.447459e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.447953e+05 ) sec^-1 -MeanMatrixElemValue = ( 3.804675e-02 +- 2.047289e-02 ) GeV^-4 -TOTAL : 0.516393 sec - 1,446,030,568 cycles:u # 2.710 GHz (76.02%) - 2,133,442 stalled-cycles-frontend:u # 0.15% frontend cycles idle (76.04%) - 5,617,954 stalled-cycles-backend:u # 0.39% backend cycles idle (75.92%) - 1,827,006,654 instructions:u # 1.26 insn per cycle - # 0.00 stalled cycles per insn (74.74%) - 0.563344751 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
3.505143e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522485 sec + 2,266,664,443 cycles # 3.011 GHz + 3,552,942,464 instructions # 1.57 insn per cycle + 0.824080628 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.737172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.742314e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.742436e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.694853e+01 +- 6.364791e+01 ) GeV^-4 -TOTAL : 6.349317 sec - 21,791,711,777 cycles:u # 3.417 GHz (74.97%) - 3,055,571 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 4,679,620 stalled-cycles-backend:u # 0.02% backend cycles idle (75.02%) - 17,472,685,439 instructions:u # 0.80 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 6.404127880 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.023944 sec + 10,029,910,184 cycles # 3.059 GHz + 21,497,951,661 instructions # 2.14 insn per cycle + 3.338904131 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.204504e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.205382e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.205382e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.446882 sec - 26,151,941,870 cycles:u # 3.502 GHz (74.94%) - 11,506,078 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) - 3,429,320,097 stalled-cycles-backend:u # 13.11% backend cycles idle (74.99%) - 82,356,606,752 instructions:u # 3.15 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 7.471566166 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.531114 sec + 25,939,606,781 cycles # 3.040 GHz + 79,447,311,630 instructions # 3.06 insn per cycle + 8.537643841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.110923e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.115571e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.115571e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.216374 sec - 11,331,264,346 cycles:u # 3.499 GHz (74.88%) - 3,860,751 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.06%) - 1,226,594,077 stalled-cycles-backend:u # 10.82% backend cycles idle (75.06%) - 38,556,360,425 instructions:u # 3.40 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 3.242722707 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.372440 sec + 12,693,692,693 cycles # 2.901 GHz + 38,521,475,204 instructions # 3.03 insn per cycle + 4.385193423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.221585e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.224270e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.224270e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.350107 sec - 4,758,515,367 cycles:u # 3.466 GHz (74.95%) - 982,235 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 451,795,587 stalled-cycles-backend:u # 9.49% backend cycles idle (74.95%) - 13,618,792,974 instructions:u # 2.86 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 1.376348821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.908191 sec + 5,531,901,200 cycles # 2.893 GHz + 13,605,961,475 instructions # 2.46 insn per cycle + 1.920337987 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.699452 sec + 4,910,284,170 cycles # 2.883 GHz + 12,271,024,564 instructions # 2.50 insn per cycle + 1.712563313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.177959 sec + 4,164,411,217 cycles # 1.910 GHz + 6,442,301,345 instructions # 1.55 insn per cycle + 2.190574077 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276857E-004 +Relative difference = 2.956342832710188e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 7372ffa56a..93119c7539 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:13:32 +DATE: 2024-03-01_02:37:42 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.010631e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.015965e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.016022e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.444933 sec - 32,646,179,524 cycles:u # 3.456 GHz (74.93%) - 3,576,067 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 6,600,395 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%) - 25,762,478,293 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.495322400 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.065940e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.448496 sec + 8,082,390,398 cycles # 2.946 GHz + 16,852,562,382 instructions # 2.09 insn per cycle + 2.848455369 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.553385e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.557014e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.557039e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 8.969163 sec - 30,985,581,306 cycles:u # 3.449 GHz (75.00%) - 3,676,317 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 6,175,773 stalled-cycles-backend:u # 0.02% backend cycles idle (74.93%) - 24,530,344,469 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 9.014444906 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.002127 sec + 13,348,526,839 cycles # 3.088 GHz + 31,140,905,358 instructions # 2.33 insn per cycle + 4.382097820 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.024281e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.024308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.024308e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.156094 sec - 18,123,944,756 cycles:u # 3.500 GHz (74.97%) - 25,307,192 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.97%) - 2,201,506,124 stalled-cycles-backend:u # 12.15% backend cycles idle (74.97%) - 55,196,127,858 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 5.180638609 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.566168 sec + 18,831,689,747 cycles # 2.868 GHz + 53,916,332,004 instructions # 2.86 insn per cycle + 6.572689464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.242509e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.242634e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.242634e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.355912 sec - 8,324,081,906 cycles:u # 3.500 GHz (74.90%) - 1,258,174 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 790,392,106 stalled-cycles-backend:u # 9.50% backend cycles idle (75.11%) - 27,094,079,183 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 2.381888397 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.182674 sec + 9,806,871,766 cycles # 3.081 GHz + 27,093,022,297 instructions # 2.76 insn per cycle + 3.192772007 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.215964e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.216635e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.216635e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.014015 sec - 3,603,710,395 cycles:u # 3.477 GHz (74.59%) - 959,353 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.92%) - 291,639,930 stalled-cycles-backend:u # 8.09% backend cycles idle (75.25%) - 9,572,736,438 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (75.31%) - 1.039798359 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.462430 sec + 4,231,767,010 cycles # 2.892 GHz + 9,562,001,834 instructions # 2.26 insn per cycle + 1.472832936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.282131 sec + 3,734,243,960 cycles # 2.905 GHz + 8,486,594,514 instructions # 2.27 insn per cycle + 1.294140643 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.432645 sec + 2,701,519,987 cycles # 1.882 GHz + 4,274,080,381 instructions # 1.58 insn per cycle + 1.444722496 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 0009f2aa16..7163808f45 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:55:46 +DATE: 2024-03-01_03:17:34 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.066117e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.066962e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.066962e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.400568 sec - 32,571,712,804 cycles:u # 3.457 GHz (74.98%) - 3,816,546 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 10,306,318 stalled-cycles-backend:u # 0.03% backend cycles idle (74.96%) - 25,702,777,180 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 9.451158645 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.373786 sec + 8,212,794,649 cycles # 3.050 GHz + 17,373,508,782 instructions # 2.12 insn per cycle + 2.749788140 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.555020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558714e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558714e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 8.996817 sec - 31,094,202,940 cycles:u # 3.448 GHz (74.98%) - 4,612,912 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 60,925,647 stalled-cycles-backend:u # 0.20% backend cycles idle (74.97%) - 24,633,781,700 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.045426867 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.992060 sec + 13,207,906,873 cycles # 3.062 GHz + 30,525,969,027 instructions # 2.31 insn per cycle + 4.371813741 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.018995e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019024e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019024e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.182058 sec - 18,215,591,702 cycles:u # 3.500 GHz (74.94%) - 24,842,932 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.94%) - 2,236,448,537 stalled-cycles-backend:u # 12.28% backend cycles idle (74.94%) - 55,230,783,802 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 5.207347443 seconds time elapsed -=Symbols 
in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.484661 sec + 18,737,465,302 cycles # 2.888 GHz + 53,915,906,594 instructions # 2.88 insn per cycle + 6.488680620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.246982e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247111e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247111e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.351645 sec - 8,293,281,219 cycles:u # 3.493 GHz (74.99%) - 576,460 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.07%) - 728,986,715 stalled-cycles-backend:u # 8.79% backend cycles idle (75.07%) - 27,045,320,734 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 2.377713290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.177972 sec + 9,794,551,146 cycles # 3.079 GHz + 27,093,049,280 instructions # 2.77 insn per cycle + 3.182112356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
-runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.249531e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.250288e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.250288e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.007623 sec - 3,594,206,633 cycles:u # 3.491 GHz (74.73%) - 1,276,276 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) - 288,960,146 stalled-cycles-backend:u # 8.04% backend cycles idle (75.14%) - 9,597,826,038 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.15%) - 1.033168851 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.495047 sec + 4,300,282,840 cycles # 2.870 GHz + 9,561,701,370 instructions # 2.22 insn per cycle + 1.499121189 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.287264 sec + 3,730,461,014 cycles # 2.891 GHz + 8,485,603,542 instructions # 2.27 insn per cycle + 1.291227222 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.415968 sec + 2,690,639,160 cycles # 1.896 GHz + 4,273,336,878 instructions # 1.59 insn per cycle + 1.420067464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 0a6a2db2de..fcaae9673e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:15:12 +DATE: 2024-03-01_02:38:46 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.061342e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.067173e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.067226e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.363420 sec - 32,427,252,088 cycles:u # 3.456 GHz (74.94%) - 3,678,760 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 6,561,717 stalled-cycles-backend:u # 0.02% backend cycles idle (75.04%) - 25,590,076,713 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 9.410773685 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.067339e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.446944 sec + 8,408,759,874 cycles # 3.068 GHz + 18,673,492,162 instructions # 2.22 insn per cycle + 2.843675081 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.558322e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.561699e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.561723e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 8.970763 sec - 31,031,440,441 cycles:u # 3.452 GHz (75.02%) - 3,594,598 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.06%) - 7,127,107 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 24,539,030,272 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 9.017990720 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.986190 sec + 13,309,313,958 cycles # 3.084 GHz + 29,253,936,467 instructions # 2.20 insn per cycle + 4.370982628 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.027524e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.027551e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.027551e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.139911 sec - 18,076,730,884 cycles:u # 3.502 GHz (74.91%) - 27,247,063 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.97%) - 2,156,497,484 stalled-cycles-backend:u # 11.93% backend cycles idle (75.04%) - 55,173,526,576 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 5.164392308 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.219195 sec + 18,809,079,145 cycles # 3.025 GHz + 53,925,834,666 instructions # 2.87 insn per cycle + 6.232860023 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.239128e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.239256e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.239256e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.359467 sec - 8,324,328,252 cycles:u # 3.495 GHz (74.84%) - 1,018,855 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%) - 810,197,061 stalled-cycles-backend:u # 9.73% backend cycles idle (74.92%) - 27,094,690,757 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.14%) - 2.385087332 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.189478 sec + 9,805,870,159 cycles # 3.076 GHz + 27,091,831,447 instructions # 2.76 insn per cycle + 3.203897537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.196329e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.196991e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.196991e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.017365 sec - 3,618,316,263 cycles:u # 3.481 GHz (74.61%) - 1,889,874 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.43%) - 305,653,445 stalled-cycles-backend:u # 8.45% backend cycles idle (74.81%) - 9,597,446,711 instructions:u # 2.65 insn per cycle - # 0.03 stalled cycles per insn (75.39%) - 1.043068302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.464714 sec + 4,224,699,489 cycles # 2.882 GHz + 9,562,401,622 instructions # 2.26 insn per cycle + 1.476328883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.294499 sec + 3,723,740,700 cycles # 2.874 GHz + 8,486,051,495 instructions # 2.28 insn per cycle + 1.308410916 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.421818 sec + 2,699,411,216 cycles # 1.899 GHz + 4,277,531,970 instructions # 1.58 insn per cycle + 1.435104148 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 9f7921db81..e89ab34326 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:16:52 +DATE: 2024-03-01_02:39:49 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.808345e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812077e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.812115e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.492549 sec - 15,357,555,686 cycles:u # 3.403 GHz (75.00%) - 2,789,283 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 6,530,614 stalled-cycles-backend:u # 0.04% backend cycles idle (75.02%) - 12,484,001,278 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.09%) - 4.539017619 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.769342e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.706494 sec + 5,724,877,835 cycles # 2.946 GHz + 11,350,286,337 instructions # 1.98 insn per cycle + 2.064496697 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.381501e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.397017e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.397171e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.646461 sec - 15,935,601,520 cycles:u # 3.416 GHz (74.91%) - 2,847,425 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) - 7,051,629 stalled-cycles-backend:u # 0.04% backend cycles idle (74.95%) - 12,917,649,233 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 4.690156490 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.926202 sec + 6,794,636,243 cycles # 3.076 GHz + 13,931,883,029 instructions # 2.05 insn per cycle + 2.265774235 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.105909e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.105943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.105943e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.775823 sec - 16,793,710,535 cycles:u # 3.500 GHz (74.99%) - 14,312,647 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.99%) - 1,786,126,710 stalled-cycles-backend:u # 10.64% backend cycles idle (74.99%) - 51,809,018,697 instructions:u # 3.09 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 4.800324805 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.899633 sec + 18,012,008,843 cycles # 3.055 GHz + 53,588,806,253 instructions # 2.98 insn per cycle + 5.906269981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.600191e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.600727e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.600727e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.149306 sec - 4,082,382,206 cycles:u # 3.485 GHz (74.76%) - 1,225,467 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.74%) - 423,325,187 stalled-cycles-backend:u # 10.37% backend cycles idle (74.55%) - 13,835,448,548 instructions:u # 3.39 insn per cycle - # 0.03 stalled cycles per insn (74.90%) - 1.175151715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.492504 sec + 4,596,969,768 cycles # 3.077 GHz + 13,763,413,131 instructions # 2.99 insn per cycle + 1.508036951 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.041507e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041782e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.041782e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.508518 sec - 1,834,426,422 cycles:u # 3.458 GHz (74.41%) - 689,353 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.37%) - 157,999,464 stalled-cycles-backend:u # 8.61% backend cycles idle (74.48%) - 4,889,172,809 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 0.533897559 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.749250 sec + 2,146,538,234 cycles # 2.864 GHz + 4,817,770,938 instructions # 2.24 insn per cycle + 0.763621351 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.652928 sec + 1,865,233,671 cycles # 2.849 GHz + 4,274,819,205 instructions # 2.29 insn per cycle + 0.666710238 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.715424 sec + 1,360,172,621 cycles # 1.900 GHz + 2,159,744,323 instructions # 1.59 insn per cycle + 0.729957103 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 4777b62c5f..684ca24c1f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,196 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:57:26 +DATE: 2024-03-01_03:18:37 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.842786e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.843194e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.843194e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 -TOTAL : 4.410058 sec - 15,108,619,628 cycles:u # 3.409 GHz (75.02%) - 2,813,974 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) - 6,668,807 stalled-cycles-backend:u # 0.04% backend cycles idle (74.92%) - 12,277,562,361 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.92%) - 4.458755435 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.598425 sec + 5,724,594,753 cycles # 3.063 GHz + 12,186,790,592 instructions # 2.13 insn per cycle + 1.928350107 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.356565e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.372551e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.372551e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 -TOTAL : 4.661126 sec - 16,000,894,811 cycles:u # 3.416 GHz (74.90%) - 3,664,056 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.88%) - 46,279,379 stalled-cycles-backend:u # 0.29% backend cycles idle (74.90%) - 12,965,129,778 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 4.705622209 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.887489 sec + 6,620,617,732 cycles # 3.045 GHz + 14,303,245,528 instructions # 2.16 insn per cycle + 2.231962749 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.098781e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098814e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098814e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.806991 sec - 16,893,766,185 cycles:u # 3.498 GHz (74.99%) - 16,568,104 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.99%) - 1,867,774,628 stalled-cycles-backend:u # 11.06% backend cycles idle (74.99%) - 51,769,420,335 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 4.831888692 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.812831 sec + 17,931,583,834 cycles # 3.083 GHz + 53,588,775,363 instructions # 2.99 insn per cycle + 5.816760256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.623898e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624434e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.624434e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.143685 sec - 4,064,180,023 cycles:u # 3.486 GHz (74.69%) - 655,050 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.48%) - 415,271,617 stalled-cycles-backend:u # 10.22% backend cycles idle (74.82%) - 13,799,508,244 instructions:u # 3.40 insn per cycle - # 0.03 stalled cycles per insn (75.30%) - 1.169308254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.483014 sec + 4,585,157,051 cycles # 3.085 GHz + 13,762,636,955 instructions # 3.00 insn per cycle + 1.487033664 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896527003E-003 +Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.044087e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.044350e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.044350e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.507458 sec - 1,819,280,916 cycles:u # 3.436 GHz (74.32%) - 555,295 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.43%) - 147,367,438 stalled-cycles-backend:u # 8.10% backend cycles idle (74.91%) - 4,855,416,131 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.56%) - 0.532829777 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.734407 sec + 2,124,324,714 cycles # 2.880 GHz + 4,817,114,861 instructions # 2.27 insn per cycle + 0.738469635 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.686036 sec + 1,868,608,359 cycles # 2.710 GHz + 4,274,464,507 instructions # 2.29 insn per cycle + 0.690085324 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070551E-003 +Relative difference = 1.858823877057982e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.701778 sec + 1,356,865,477 cycles # 1.924 GHz + 2,159,196,207 instructions # 1.59 insn per cycle + 0.705773287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982958280E-003 +Relative difference = 2.0044092642523172e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 34e9c88382..2af18ad9d5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:17:55 +DATE: 2024-03-01_02:40:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.874105e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877606e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877630e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.256794e-06 +- 4.775721e-07 ) GeV^-6 -TOTAL : 4.367192 sec - 14,910,823,313 cycles:u # 3.399 GHz (75.04%) - 2,801,107 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.06%) - 5,541,591 stalled-cycles-backend:u # 0.04% backend cycles idle (74.97%) - 12,185,699,999 instructions:u # 0.82 insn per cycle - # 0.00 stalled cycles per insn (74.91%) - 4.413931935 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
6.766757e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.693781 sec + 5,858,518,501 cycles # 3.029 GHz + 12,487,165,720 instructions # 2.13 insn per cycle + 2.044833380 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.363810e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.380339e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.380485e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.930014e-02 +- 1.363569e-02 ) GeV^-6 -TOTAL : 4.641177 sec - 15,913,862,306 cycles:u # 3.415 GHz (74.94%) - 2,911,751 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.94%) - 5,938,296 stalled-cycles-backend:u # 0.04% backend cycles idle (74.94%) - 12,907,264,996 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.684794214 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.933893 sec + 6,737,061,424 cycles # 3.047 GHz + 14,801,104,127 instructions # 2.20 insn per cycle + 2.267780802 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.103979e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104009e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104009e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.784039 sec - 16,830,780,157 cycles:u # 3.502 GHz (74.95%) - 16,228,839 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) - 1,752,024,514 stalled-cycles-backend:u # 10.41% backend cycles idle (75.03%) - 51,771,030,949 instructions:u # 3.08 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 4.808440063 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.925615 sec + 17,989,215,363 cycles # 3.036 GHz + 53,579,777,630 instructions # 2.98 insn per cycle + 5.931642569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087396841E-003 -Relative difference = 2.119623377106246e-08 +Avg ME (F77/C++) = 9.8479612087582491E-003 +Relative difference = 2.1198118933954545e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.594743e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.595274e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.595274e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.150569 sec - 4,073,477,842 cycles:u # 3.474 GHz (74.76%) - 700,890 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.76%) - 412,825,647 stalled-cycles-backend:u # 10.13% backend cycles idle (74.82%) - 13,787,072,091 instructions:u # 3.38 insn per cycle - # 0.03 stalled cycles per insn (75.16%) - 1.176252412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.490731 sec + 4,558,556,123 cycles # 3.055 GHz + 13,757,084,226 instructions # 3.02 insn per cycle + 1.501811120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546896225560E-003 +Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.046827e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.047098e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.047098e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.505708 sec - 1,825,688,224 cycles:u # 3.459 GHz (74.31%) - 651,926 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.24%) - 162,825,670 stalled-cycles-backend:u # 8.92% backend cycles idle (74.49%) - 4,891,061,214 instructions:u # 2.68 insn per cycle - # 0.03 stalled cycles per insn (75.26%) - 0.531282824 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.743943 sec + 2,139,817,263 cycles # 2.875 GHz + 4,819,936,629 instructions # 2.25 insn per cycle + 0.755587883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.647666 sec + 1,869,906,105 cycles # 2.875 GHz + 4,276,791,956 instructions # 2.29 insn per cycle + 0.664053491 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728161070967E-003 +Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.718650 sec + 1,366,457,842 cycles # 1.901 GHz + 2,166,062,692 instructions # 1.59 insn per cycle + 0.731356674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892981e-03 +Avg ME (F77/C++) = 9.8929811982955140E-003 +Relative difference = 2.0044060904369713e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c232f7de62..c639834643 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:18:58 +DATE: 2024-03-01_02:41:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.742746e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.747735e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.747827e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.591909 sec - 33,228,560,743 cycles:u # 3.457 GHz (74.98%) - 3,697,395 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 8,571,213 stalled-cycles-backend:u # 0.03% backend cycles idle (75.01%) - 26,208,740,975 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.639434606 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.691928e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.198692 sec + 7,604,134,018 cycles # 3.054 GHz + 16,321,512,266 instructions # 2.15 insn per cycle + 2.594812497 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.300447e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.303532e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.303565e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.331694 sec - 32,295,291,917 cycles:u # 3.453 GHz (75.00%) - 3,442,980 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 6,323,865 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) - 25,512,801,065 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.377328680 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.397194 sec + 11,475,121,938 cycles # 3.084 GHz + 26,000,925,285 instructions # 2.27 insn per cycle + 3.777191130 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.028268e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.028295e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.028295e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.136405 sec - 18,072,585,799 cycles:u # 3.504 GHz (74.94%) - 24,128,528 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.01%) - 2,188,182,117 stalled-cycles-backend:u # 12.11% backend cycles idle (75.03%) - 55,394,719,073 instructions:u # 3.07 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.160896024 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.578920 sec + 19,096,747,933 cycles # 2.903 GHz + 54,154,360,803 instructions # 2.84 insn per cycle + 6.585797711 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.356983e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.357122e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.357122e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.242013 sec - 7,911,518,876 cycles:u # 3.494 GHz (74.93%) - 1,746,084 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 847,402,982 stalled-cycles-backend:u # 10.71% backend cycles idle (74.92%) - 25,904,295,421 instructions:u # 3.27 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 2.267909527 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.239396 sec + 9,369,032,238 cycles # 2.892 GHz + 26,160,172,444 instructions # 2.79 insn per cycle + 3.251135271 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.482063e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.482807e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.482807e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.964815 sec - 3,422,437,458 cycles:u # 3.468 GHz (74.87%) - 617,421 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) - 268,378,370 stalled-cycles-backend:u # 7.84% backend cycles idle (74.88%) - 9,118,206,141 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (74.90%) - 0.990276746 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.438333 sec + 4,079,178,507 cycles # 2.840 GHz + 9,228,646,226 instructions # 2.26 insn per cycle + 1.450605350 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.218747 sec + 3,509,445,956 cycles # 2.879 GHz + 8,176,263,750 instructions # 2.33 insn per cycle + 1.230057623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381042 sec + 2,620,845,167 cycles # 1.898 GHz + 4,155,618,865 instructions # 1.59 insn per cycle + 1.395419124 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index d509869f7f..ace04f97d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,185 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-03_14:20:40 +DATE: 2024-03-01_02:42:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.803581e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.809144e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.809214e+01 ) sec^-1 -MeanMatrixElemValue = ( 3.297256e-04 +- 2.011325e-04 ) GeV^-6 -TOTAL : 9.513476 sec - 32,910,544,812 cycles:u # 3.452 GHz (75.02%) - 3,566,297 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) - 6,736,704 stalled-cycles-backend:u # 0.02% backend cycles idle (75.07%) - 26,011,552,309 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.562585571 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.692361e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.171682 sec + 7,616,890,265 cycles # 3.058 GHz + 16,356,089,453 instructions # 2.15 insn per cycle + 2.553555988 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:HIRDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.335565e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.338667e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.338703e+03 ) sec^-1 -MeanMatrixElemValue = ( 7.973353e-04 +- 5.853892e-04 ) GeV^-6 -TOTAL : 9.255352 sec - 32,027,497,769 cycles:u # 3.454 GHz (75.00%) - 3,680,692 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 6,546,956 stalled-cycles-backend:u # 0.02% backend cycles idle (74.98%) - 25,314,693,014 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 9.300974978 seconds time elapsed 
+EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.406322 sec + 11,260,210,288 cycles # 3.017 GHz + 25,906,087,343 instructions # 2.30 insn per cycle + 3.788413520 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.019957e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019984e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019984e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.178293 sec - 18,209,225,630 cycles:u # 3.502 GHz (74.93%) - 33,008,449 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.93%) - 2,203,899,310 stalled-cycles-backend:u # 12.10% backend cycles idle (74.96%) - 55,456,506,347 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.202792696 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.644473 sec + 19,262,229,911 cycles # 2.898 GHz + 54,152,472,780 instructions # 2.81 insn per cycle + 6.648593616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.351772e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.351921e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.351921e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.246515 sec - 7,913,454,949 cycles:u # 3.488 GHz (74.97%) - 1,078,644 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 769,038,758 stalled-cycles-backend:u # 9.72% backend cycles idle (74.97%) - 25,807,143,781 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (74.97%) - 2.272377888 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.257928 sec + 9,349,757,536 cycles # 2.867 GHz + 26,077,919,393 instructions # 2.79 insn per cycle + 3.270643449 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 
6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.540730e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.541492e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.541492e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.954631 sec - 3,385,956,590 cycles:u # 3.466 GHz (74.62%) - 666,916 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.67%) - 307,339,922 stalled-cycles-backend:u # 9.08% backend cycles idle (75.08%) - 9,093,841,869 instructions:u # 2.69 insn per cycle - # 0.03 stalled cycles per insn (75.43%) - 0.980198317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.408906 sec + 4,059,558,991 cycles # 2.874 GHz + 9,213,876,384 instructions # 2.27 insn per cycle + 1.420092908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 
tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.231479 sec + 3,558,951,872 cycles # 2.881 GHz + 8,168,148,330 instructions # 2.30 insn per cycle + 1.241837128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381601 sec + 2,619,896,392 cycles # 1.892 GHz + 4,153,497,129 instructions # 1.59 insn per cycle + 1.390536918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index be5aca464a..4f705cbffa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:12:08 +DATE: 2024-03-01_02:35:57 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.446213 sec + 1,972,017,701 cycles # 2.992 GHz + 2,778,256,208 instructions # 1.41 insn per cycle + 0.734930275 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 695,623,402 cycles:u # 2.059 GHz (77.12%) - 2,296,681 stalled-cycles-frontend:u # 0.33% frontend cycles idle (73.00%) - 5,536,743 stalled-cycles-backend:u # 0.80% backend cycles idle (69.67%) - 1,262,080,978 instructions:u # 1.81 insn per cycle - # 0.00 stalled cycles per insn (73.44%) - 0.384767862 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 916,816,301 cycles:u # 2.085 GHz (74.02%) - 2,147,831 stalled-cycles-frontend:u # 0.23% frontend cycles idle (72.75%) - 5,227,451 stalled-cycles-backend:u # 0.57% backend cycles idle (74.90%) - 1,320,642,522 instructions:u # 1.44 insn per cycle - # 0.00 stalled cycles per insn (77.11%) - 0.466448613 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.528224 sec + 2,304,762,750 cycles # 3.008 GHz + 3,294,040,641 instructions # 1.43 insn per cycle + 0.823439197 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6911ee0) on address 0x14da00f7a000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14dc962e3dbf in ??? -#1 0x14dc962e3d2b in ??? -#2 0x14dc962e53e4 in ??? -#3 0x14dc8e7b6b64 in ??? -#4 0x14dc8e7b3b38 in ??? -#5 0x14dc8e771496 in ??? -#6 0x14dc9627d6e9 in ??? -#7 0x14dc963b149e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.178542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.198431e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.198431e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.413844 sec - 4,996,522,481 cycles:u # 3.478 GHz (74.95%) - 2,383,415 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.95%) - 850,537,335 stalled-cycles-backend:u # 17.02% backend cycles idle (74.95%) - 13,818,020,085 instructions:u # 2.77 insn per cycle - # 0.06 stalled cycles per insn (74.96%) - 
1.439637830 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.522856 sec + 4,703,604,569 cycles # 3.081 GHz + 13,462,460,024 instructions # 2.86 insn per cycle + 1.529442917 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.861454 sec + 2,622,516,081 cycles # 3.029 GHz + 7,553,226,055 instructions # 2.88 insn per cycle + 0.875162721 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506903 sec + 1,479,878,074 cycles # 2.896 GHz + 3,120,545,502 instructions # 2.11 insn per cycle + 0.521612120 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.456990 sec + 1,342,026,946 cycles # 2.909 GHz + 2,982,806,139 instructions # 2.22 insn per cycle + 0.473253864 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x1511a565a000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.665523 sec + 1,326,336,546 cycles # 1.981 GHz + 1,954,248,677 instructions # 1.47 insn per cycle + 0.676015017 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 9949de34d4..7838899130 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,119 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:54:13 +DATE: 2024-03-01_03:15:54 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.471075 sec + 2,051,009,542 cycles # 3.009 GHz + 3,055,349,974 instructions # 1.49 insn per cycle + 0.738770181 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 769,805,366 cycles:u # 2.194 GHz (75.09%) - 3,066,140 stalled-cycles-frontend:u # 0.40% frontend cycles idle (75.19%) - 22,207,789 stalled-cycles-backend:u # 2.88% backend cycles idle (75.94%) - 1,218,976,716 instructions:u # 1.58 insn per cycle - # 0.02 stalled cycles per insn (76.01%) - 0.377671175 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 3,001,509,066 cycles:u # 2.758 GHz (75.33%) - 30,405,025 stalled-cycles-frontend:u # 1.01% frontend cycles idle (75.18%) - 855,704,628 stalled-cycles-backend:u # 28.51% backend cycles idle (74.99%) - 3,253,779,151 instructions:u # 1.08 insn per cycle - # 0.26 stalled cycles per insn (74.65%) - 1.115519804 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.748132 sec + 3,046,262,026 cycles # 3.023 GHz + 4,636,082,832 instructions # 1.52 insn per cycle + 1.065675268 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6911ee0) on address 0x14f2f231a000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14f587689dbf in ??? -#1 0x14f587689d2b in ??? -#2 0x14f58768b3e4 in ??? -#3 0x14f57fb5cb64 in ??? -#4 0x14f57fb59b38 in ??? -#5 0x14f57fb17496 in ??? -#6 0x14f5876236e9 in ??? -#7 0x14f58775749e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.180080e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200126e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.416017 sec - 4,992,511,017 cycles:u # 3.469 GHz (74.99%) - 1,981,947 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.99%) - 839,238,185 stalled-cycles-backend:u # 16.81% backend cycles idle (74.99%) - 13,804,114,722 instructions:u # 2.76 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 1.441472413 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.529900 sec + 4,728,814,715 cycles # 3.083 GHz + 13,467,526,764 instructions # 2.85 insn per cycle + 1.534252544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x1478f912a000. Reason: Unknown. 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.869004 sec + 2,652,875,861 cycles # 3.039 GHz + 7,602,145,003 instructions # 2.87 insn per cycle + 0.873736497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 
6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.550316 sec + 1,514,222,662 cycles # 2.732 GHz + 3,170,467,422 instructions # 2.09 insn per cycle + 0.554802806 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.478096 sec + 1,374,122,120 cycles # 2.850 GHz + 3,032,631,270 instructions # 2.21 insn per cycle + 0.482825268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.675099 sec + 1,354,490,621 cycles # 1.996 GHz + 1,991,409,834 instructions # 1.47 insn per cycle + 0.679620955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 8450f3c38f..1de3a7df55 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:12:22 +DATE: 2024-03-01_02:36:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.443315 sec + 2,012,981,464 cycles # 3.013 GHz + 2,802,025,362 instructions # 1.39 insn per cycle + 0.744859677 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 726,970,759 cycles:u # 2.147 GHz (75.40%) - 2,277,890 stalled-cycles-frontend:u # 0.31% frontend cycles idle (74.87%) - 4,777,867 stalled-cycles-backend:u # 0.66% backend cycles idle (73.78%) - 1,295,834,284 instructions:u # 1.78 insn per cycle - # 0.00 stalled cycles per insn (70.60%) - 0.369418155 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 879,697,664 cycles:u # 2.004 GHz (77.05%) - 2,139,401 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.33%) - 5,059,191 stalled-cycles-backend:u # 0.58% backend cycles idle (76.36%) - 1,322,149,268 instructions:u # 1.50 insn per cycle - # 0.00 stalled cycles per insn (75.51%) - 0.482825554 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.526694 sec + 2,300,725,267 cycles # 3.007 GHz + 3,244,738,845 instructions # 1.41 insn per cycle + 0.822736768 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6911e30) on address 0x154da8629000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x15503d999dbf in ??? -#1 0x15503d999d2b in ??? -#2 0x15503d99b3e4 in ??? -#3 0x155035e6cb64 in ??? -#4 0x155035e69b38 in ??? -#5 0x155035e27496 in ??? -#6 0x15503d9336e9 in ??? -#7 0x15503da6749e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.182610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202714e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.408665 sec - 4,975,987,017 cycles:u # 3.477 GHz (74.85%) - 1,841,925 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.85%) - 659,857,967 stalled-cycles-backend:u # 13.26% backend cycles idle (74.85%) - 13,849,894,286 instructions:u # 2.78 insn per cycle - # 0.05 stalled cycles per insn (74.90%) - 
1.433203022 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.520645 sec + 4,710,102,553 cycles # 3.090 GHz + 13,456,334,828 instructions # 2.86 insn per cycle + 1.527404362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499484 +Relative difference = 5.286896509487005e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.841713 sec + 2,618,818,041 cycles # 3.096 GHz + 7,552,217,415 instructions # 2.88 insn per cycle + 0.854217946 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499478 +Relative difference = 5.28689651338321e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506766 sec + 1,482,977,233 cycles # 2.909 GHz + 3,119,381,568 instructions # 2.10 insn per cycle + 0.519705447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457488 sec + 1,337,095,985 cycles # 2.896 GHz + 2,979,946,273 instructions # 2.23 insn per cycle + 0.473330982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x66d340) on address 0x14a1af479000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.666550 sec + 1,326,556,264 cycles # 1.978 GHz + 1,952,513,162 instructions # 1.47 insn per cycle + 0.681133765 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 9d0c590c91..4d40239a82 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:12:37 +DATE: 2024-03-01_02:36:34 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.439896 sec + 1,919,384,660 cycles # 2.928 GHz + 2,652,462,812 instructions # 1.38 insn per cycle + 0.728915663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 718,932,282 cycles:u # 2.158 GHz (73.06%) - 2,292,913 stalled-cycles-frontend:u # 0.32% frontend cycles idle (71.35%) - 5,485,037 stalled-cycles-backend:u # 0.76% backend cycles idle (73.12%) - 1,229,882,023 instructions:u # 1.71 insn per cycle - # 0.00 stalled cycles per insn (75.81%) - 0.361196020 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 778,058,565 cycles:u # 2.038 GHz (75.01%) - 2,296,658 stalled-cycles-frontend:u # 0.30% frontend cycles idle (72.63%) - 5,017,192 stalled-cycles-backend:u # 0.64% backend cycles idle (74.44%) - 1,258,146,385 instructions:u # 1.62 insn per cycle - # 0.00 stalled cycles per insn (76.74%) - 0.409848855 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476459 sec + 2,111,535,021 cycles # 3.010 GHz + 2,984,192,787 instructions # 1.41 insn per cycle + 0.759063881 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x690ff00) on address 0x1492f1885000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x149586befdbf in ??? -#1 0x149586befd2b in ??? -#2 0x149586bf13e4 in ??? -#3 0x14957f0c2b64 in ??? -#4 0x14957f0bfb38 in ??? -#5 0x14957f07d496 in ??? -#6 0x149586b896e9 in ??? -#7 0x149586cbd49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.430784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.461040e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.461040e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.166312 sec - 4,125,984,555 cycles:u # 3.473 GHz (75.09%) - 2,301,196 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.11%) - 315,086,834 stalled-cycles-backend:u # 7.64% backend cycles idle (75.11%) - 12,630,321,325 instructions:u # 3.06 insn per cycle - # 0.02 stalled cycles per insn (75.10%) - 
1.190560148 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.434431 sec + 4,452,862,887 cycles # 3.097 GHz + 13,047,773,125 instructions # 2.93 insn per cycle + 1.440725517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547840 sec + 1,698,684,785 cycles # 3.077 GHz + 4,513,142,797 instructions # 2.66 insn per cycle + 0.560862800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.289099 sec + 853,788,001 cycles # 2.912 GHz + 1,897,231,072 instructions # 2.22 insn per cycle + 0.300313484 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.271830 sec + 801,479,133 cycles # 2.904 GHz + 1,820,357,988 instructions # 2.27 insn per cycle + 0.285846070 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6242c0) on address 0x14ed3f78d000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.349567 sec + 731,841,700 cycles # 2.069 GHz + 1,305,336,291 instructions # 1.78 insn per cycle + 0.359850888 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 3b6b47fdec..441da29ffb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,119 +1,241 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:54:28 +DATE: 2024-03-01_03:16:12 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.448633 sec + 2,014,530,108 cycles # 3.024 GHz + 2,953,646,670 instructions # 1.47 insn per cycle + 0.724573840 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 728,018,820 cycles:u # 2.102 GHz (74.62%) - 2,828,733 stalled-cycles-frontend:u # 0.39% frontend cycles idle (76.61%) - 37,815,477 stalled-cycles-backend:u # 5.19% backend cycles idle (77.98%) - 1,243,724,290 instructions:u # 1.71 insn per cycle - # 0.03 stalled cycles per insn (75.72%) - 0.373758611 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,913,829,385 cycles:u # 2.886 GHz (73.84%) - 30,498,351 stalled-cycles-frontend:u # 1.05% frontend cycles idle (74.73%) - 847,633,350 stalled-cycles-backend:u # 29.09% backend cycles idle (75.76%) - 3,091,197,240 instructions:u # 1.06 insn per cycle - # 0.27 stalled cycles per insn (75.41%) - 1.034770890 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.616658 sec + 2,563,348,424 cycles # 3.027 GHz + 3,871,269,369 instructions # 1.51 insn per cycle + 0.904047137 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x690ff00) on address 0x14d87d5ed000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14db12952dbf in ??? -#1 0x14db12952d2b in ??? -#2 0x14db129543e4 in ??? -#3 0x14db0ae25b64 in ??? -#4 0x14db0ae22b38 in ??? -#5 0x14db0ade0496 in ??? -#6 0x14db128ec6e9 in ??? -#7 0x14db12a2049e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.433588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.463885e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.463885e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.166373 sec - 4,145,866,751 cycles:u # 3.486 GHz (74.65%) - 2,407,100 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.92%) - 316,076,995 stalled-cycles-backend:u # 7.62% backend cycles idle (75.12%) - 12,640,812,453 instructions:u # 3.05 insn per cycle - # 0.03 stalled cycles per insn (75.12%) - 1.191670420 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.433803 sec + 4,469,694,345 cycles # 3.110 GHz + 13,052,094,019 instructions # 2.92 insn per cycle + 1.437926738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6242c0) on address 0x154ddd27d000. Reason: Unknown. 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.554057 sec + 1,716,801,013 cycles # 3.079 GHz + 4,560,314,564 instructions # 2.66 insn per cycle + 0.558193661 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 
6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.297621 sec + 872,015,724 cycles # 2.894 GHz + 1,933,356,220 instructions # 2.22 insn per cycle + 0.301984624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.276934 sec + 818,470,682 cycles # 2.917 GHz + 1,856,220,484 instructions # 2.27 insn per cycle + 0.281151541 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.358667 sec + 751,185,964 cycles # 2.073 GHz + 1,346,032,296 instructions # 1.79 insn per cycle + 0.362975431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 9f2052d970..8918bec5c8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:12:50 +DATE: 2024-03-01_02:36:50 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.436130 sec + 1,959,442,257 cycles # 3.009 GHz + 2,743,667,126 instructions # 1.40 insn per cycle + 0.720037686 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 718,430,242 cycles:u # 2.167 GHz (75.24%) - 2,158,370 stalled-cycles-frontend:u # 0.30% frontend cycles idle (76.38%) - 5,048,832 stalled-cycles-backend:u # 0.70% backend cycles idle (74.59%) - 1,291,858,340 instructions:u # 1.80 insn per cycle - # 0.00 stalled cycles per insn (71.10%) - 0.365143940 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 806,118,878 cycles:u # 2.070 GHz (75.37%) - 2,154,811 stalled-cycles-frontend:u # 0.27% frontend cycles idle (75.52%) - 4,443,268 stalled-cycles-backend:u # 0.55% backend cycles idle (75.93%) - 1,240,070,438 instructions:u # 1.54 insn per cycle - # 0.00 stalled cycles per insn (76.95%) - 0.418220120 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476114 sec + 2,116,952,174 cycles # 3.025 GHz + 3,000,364,507 instructions # 1.42 insn per cycle + 0.758577490 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x690fe50) on address 0x150d87cec000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x15101d054dbf in ??? -#1 0x15101d054d2b in ??? -#2 0x15101d0563e4 in ??? -#3 0x151015527b64 in ??? -#4 0x151015524b38 in ??? -#5 0x1510154e2496 in ??? -#6 0x15101cfee6e9 in ??? -#7 0x15101d12249e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.421570e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.451417e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.451417e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.173486 sec - 4,157,347,370 cycles:u # 3.477 GHz (74.63%) - 2,500,400 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.82%) - 625,232,992 stalled-cycles-backend:u # 15.04% backend cycles idle (75.10%) - 12,626,682,513 instructions:u # 3.04 insn per cycle - # 0.05 stalled cycles per insn (75.25%) - 
1.197800077 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.438010 sec + 4,446,707,539 cycles # 3.084 GHz + 13,028,651,848 instructions # 2.93 insn per cycle + 1.444314220 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246857540270419 +Relative difference = 1.7265064590569047e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547784 sec + 1,696,823,876 cycles # 3.074 GHz + 4,509,092,353 instructions # 2.66 insn per cycle + 0.559046282 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246859631675157 +Relative difference = 2.5853054135974944e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.292180 sec + 859,590,330 cycles # 2.901 GHz + 1,893,994,453 instructions # 2.20 insn per cycle + 0.304986924 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.269638 sec + 798,515,936 cycles # 2.915 GHz + 1,816,168,831 instructions # 2.27 insn per cycle + 0.281600896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x621ef0) on address 0x14eb4effc000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489318272599 +Relative difference = 4.784894739577799e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.355005 sec + 734,840,966 cycles # 2.046 GHz + 1,303,017,912 instructions # 1.77 insn per cycle + 0.365594980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247489383243206 +Relative difference = 4.32888033512879e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 6681303993..9473075c44 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:13:04 +DATE: 2024-03-01_02:37:07 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444064 sec + 2,011,501,510 cycles # 2.996 GHz + 2,813,725,950 instructions # 1.40 insn per cycle + 0.745188123 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 680,746,734 cycles:u # 1.998 GHz (74.40%) - 2,192,175 stalled-cycles-frontend:u # 0.32% frontend cycles idle (70.65%) - 5,227,388 stalled-cycles-backend:u # 0.77% backend cycles idle (73.67%) - 1,163,912,158 instructions:u # 1.71 insn per cycle - # 0.00 stalled cycles per insn (77.36%) - 0.370007528 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 932,712,212 cycles:u # 2.128 GHz (72.21%) - 2,151,709 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.43%) - 4,978,401 stalled-cycles-backend:u # 0.53% backend cycles idle (77.14%) - 1,373,314,207 instructions:u # 1.47 insn per cycle - # 0.00 stalled cycles per insn (76.46%) - 0.466969143 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.531362 sec + 2,289,898,203 cycles # 2.976 GHz + 3,193,334,828 instructions # 1.39 insn per cycle + 0.827090728 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6911ee0) on address 0x14cc9bf9a000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14cf312fedbf in ??? -#1 0x14cf312fed2b in ??? -#2 0x14cf313003e4 in ??? -#3 0x14cf297d1b64 in ??? -#4 0x14cf297ceb38 in ??? -#5 0x14cf2978c496 in ??? -#6 0x14cf312986e9 in ??? -#7 0x14cf313cc49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.180041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200018e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200018e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.411883 sec - 4,987,990,687 cycles:u # 3.478 GHz (74.90%) - 1,727,890 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.90%) - 898,353,870 stalled-cycles-backend:u # 18.01% backend cycles idle (74.90%) - 13,843,424,201 instructions:u # 2.78 insn per cycle - # 0.06 stalled cycles per insn (74.91%) - 
1.436362073 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.528426 sec + 4,733,772,591 cycles # 3.090 GHz + 13,465,129,433 instructions # 2.84 insn per cycle + 1.534888113 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.843067 sec + 2,603,799,246 cycles # 3.073 GHz + 7,385,481,301 instructions # 2.84 insn per cycle + 0.853727039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.502006 sec + 1,465,753,503 cycles # 2.896 GHz + 3,056,435,528 instructions # 2.09 insn per cycle + 0.511483566 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444397 sec + 1,302,869,174 cycles # 2.905 GHz + 2,931,108,724 instructions # 2.25 insn per cycle + 0.456529729 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6466a0) on address 0x14f37779a000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.681918 sec + 1,362,782,748 cycles # 1.986 GHz + 1,970,355,079 instructions # 1.45 insn per cycle + 0.693685126 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f0f62cc1da..f04f8628ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,110 +1,224 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand -HASHIPRAND=hasHiprand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand_hasHiprand (USEBUILDDIR is set = 1) +HASCURAND=hasCurand +HASHIPRAND=hasNoHiprand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-03_14:13:18 +DATE: 2024-03-01_02:37:24 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.445224 sec + 1,992,469,002 cycles # 2.992 GHz + 2,813,148,728 instructions # 1.41 insn per cycle + 0.736789901 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 705,255,530 cycles:u # 2.120 GHz (73.74%) - 2,090,693 stalled-cycles-frontend:u # 0.30% frontend cycles idle (76.82%) - 4,796,050 stalled-cycles-backend:u # 0.68% backend cycles idle (75.80%) - 1,191,491,523 instructions:u # 1.69 insn per cycle - # 0.00 stalled cycles per insn (76.03%) - 0.361944732 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 898,559,935 cycles:u # 2.067 GHz (74.78%) - 2,139,128 stalled-cycles-frontend:u # 0.24% frontend cycles idle (76.78%) - 5,276,888 stalled-cycles-backend:u # 0.59% backend cycles idle (76.21%) - 1,369,752,031 instructions:u # 1.52 insn per cycle - # 0.00 stalled cycles per insn (76.12%) - 0.464817968 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.532147 sec + 2,297,521,664 cycles # 2.990 GHz + 3,210,517,070 instructions # 1.40 insn per cycle + 0.827894226 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6911e30) on address 0x14c05cea9000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14c2f221adbf in ??? -#1 0x14c2f221ad2b in ??? -#2 0x14c2f221c3e4 in ??? -#3 0x14c2ea6edb64 in ??? -#4 0x14c2ea6eab38 in ??? -#5 0x14c2ea6a8496 in ??? -#6 0x14c2f21b46e9 in ??? -#7 0x14c2f22e849e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use HiprandDevice, will use CommonRandom (as HiprandHost is not implemented yet) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.167118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186677e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.186677e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.427028 sec - 5,051,858,997 cycles:u # 3.486 GHz (74.65%) - 1,906,701 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.92%) - 961,495,185 stalled-cycles-backend:u # 19.03% backend cycles idle (75.16%) - 13,852,010,764 instructions:u # 2.74 insn per cycle - # 0.07 stalled cycles per insn (75.17%) - 
1.451501973 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.523445 sec + 4,724,741,346 cycles # 3.094 GHz + 13,451,257,746 instructions # 2.85 insn per cycle + 1.529633779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.835617 sec + 2,595,186,002 cycles # 3.089 GHz + 7,389,201,553 instructions # 2.85 insn per cycle + 0.854907608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.503119 sec + 1,466,604,979 cycles # 2.890 GHz + 3,056,260,975 instructions # 2.08 insn per cycle + 0.515296062 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457389 sec + 1,310,592,019 cycles # 2.838 GHz + 2,931,897,706 instructions # 2.24 insn per cycle + 0.469608344 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpu2/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x66d340) on address 0x15362ed29000. Reason: Unknown. 
+cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.689340 sec + 1,364,202,689 cycles # 1.967 GHz + 1,970,285,028 instructions # 1.44 insn per cycle + 0.699058633 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED From b0863f08dc77f15972bc361126afe9a4b225fd68 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 11:50:01 +0100 Subject: [PATCH 90/96] [susy2] rerun 78 tput tests on itgold91 for the first time, all ok - no GPU, but a Gold AVX512 CPU (Eventually the outputs of this test will be split between CPU and GPU...) --- .../log_eemumu_mad_d_inl0_hrd0.txt | 215 +++++++-------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 223 +++++++--------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 195 ++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 190 +++++--------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 217 +++++++-------- .../log_eemumu_mad_d_inl0_hrd1.txt | 215 +++++++-------- .../log_eemumu_mad_d_inl1_hrd0.txt | 215 +++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 215 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 215 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 223 +++++++--------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 195 ++++++-------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 190 +++++--------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 217 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 215 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 215 +++++++-------- .../log_eemumu_mad_f_inl1_hrd1.txt | 215 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 215 +++++++-------- .../log_eemumu_mad_m_inl0_hrd1.txt 
| 215 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 217 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 225 +++++++--------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 197 ++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 192 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 219 +++++++--------- .../log_ggtt_mad_d_inl0_hrd1.txt | 217 +++++++-------- .../log_ggtt_mad_d_inl1_hrd0.txt | 215 +++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 215 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 217 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 225 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 197 ++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 192 +++++--------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 219 +++++++--------- .../log_ggtt_mad_f_inl0_hrd1.txt | 217 +++++++-------- .../log_ggtt_mad_f_inl1_hrd0.txt | 215 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 215 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 217 +++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 217 +++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_ggttg_mad_d_inl0_hrd1.txt | 231 +++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_ggttg_mad_f_inl0_hrd1.txt | 231 +++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 233 +++++++---------- .../log_ggttg_mad_m_inl0_hrd1.txt | 233 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 211 ++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 206 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 234 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 231 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 231 +++++++--------- 
.../log_ggttgg_mad_d_inl1_hrd1.txt | 231 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 211 ++++++--------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 206 +++++---------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 234 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 237 +++++++---------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 235 +++++++---------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 235 +++++++---------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 231 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 231 +++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 235 +++++++---------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 247 +++++++----------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 235 +++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 231 +++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 231 +++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 231 +++++++--------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_gqttq_mad_d_inl0_hrd1.txt | 231 +++++++--------- .../log_gqttq_mad_f_inl0_hrd0.txt | 231 +++++++--------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 243 +++++++---------- .../log_gqttq_mad_f_inl0_hrd1.txt | 231 +++++++--------- .../log_gqttq_mad_m_inl0_hrd0.txt | 231 +++++++--------- .../log_gqttq_mad_m_inl0_hrd1.txt | 231 +++++++--------- 78 files changed, 7289 insertions(+), 10083 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index baa8c044cd..ed28df1cad 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ 
b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:23:52 +DATE: 2024-03-01_19:08:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.907657 sec - 2,864,594,511 cycles # 3.017 GHz - 4,419,491,827 instructions # 1.54 insn per cycle - 1.243823060 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.029383 sec - 18,345,746,310 cycles # 3.041 GHz - 43,971,705,846 instructions # 2.40 insn per cycle - 6.038464488 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.425993e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645967e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.035526 sec + 17,606,144,392 cycles # 3.495 GHz + 44,071,163,755 instructions # 2.50 insn per cycle + 5.038823878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.154865 sec - 12,823,382,487 cycles # 3.082 GHz - 30,998,172,347 instructions # 2.42 insn per cycle - 4.171623433 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.250731e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.893394e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.893394e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.401156 sec + 11,895,409,777 cycles # 3.495 GHz + 30,996,596,899 instructions # 2.61 insn per cycle + 3.404468118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.406763 sec - 10,081,289,557 cycles # 2.955 GHz - 19,366,111,959 instructions # 1.92 insn per cycle - 3.427414790 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.710645e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.676510e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.676510e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.897930 sec + 9,801,226,041 cycles # 3.380 GHz + 19,263,940,378 instructions # 1.97 insn per cycle + 2.901252745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.257696 sec - 9,685,682,355 cycles # 2.968 GHz - 18,976,171,527 instructions # 1.96 insn per cycle - 3.273948471 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.787050e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808409e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808409e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.849886 sec + 9,645,321,314 cycles # 3.382 GHz + 18,674,064,614 instructions # 1.94 insn per cycle + 2.853150797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.888242 sec - 8,621,851,062 cycles # 2.214 GHz - 15,727,334,662 instructions # 1.82 insn per cycle - 3.905958468 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.516175e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.268017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.268017e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.378272 sec + 8,078,122,389 cycles # 3.393 GHz + 15,420,673,885 instructions # 1.91 insn per cycle + 2.381601605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index b9ff72dbf3..062460fe0a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,219 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:12:58 +DATE: 2024-03-01_19:28:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.232505 sec - 7,524,955,995 cycles # 3.041 GHz - 13,468,669,108 instructions # 1.79 insn per cycle - 2.532807464 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.415532 sec - 19,561,606,037 cycles # 3.046 GHz - 44,198,639,919 instructions # 2.26 insn per cycle - 6.422457347 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.384713e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.591202e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591202e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.251037 sec + 18,383,972,346 cycles # 3.499 GHz + 44,316,778,142 instructions # 2.41 insn per cycle + 5.255195321 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.664054 sec - 13,997,557,946 cycles # 2.998 GHz - 31,841,279,233 instructions # 2.27 insn per cycle - 4.670791737 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.136391e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.710309e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.710309e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.710222 sec + 12,989,118,477 cycles # 3.498 GHz + 31,817,590,346 instructions # 2.45 insn per cycle + 3.714348648 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.823801 sec - 11,324,833,068 cycles # 2.957 GHz - 20,724,775,427 instructions # 1.83 insn per cycle - 3.830534322 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.564211e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.413599e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.413599e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.183000 sec + 10,807,008,900 cycles # 3.392 GHz + 20,602,119,150 instructions # 1.91 insn per cycle + 3.187089748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.704930 sec - 10,963,593,820 cycles # 2.954 GHz - 20,347,072,159 instructions # 1.86 insn per cycle - 3.711957869 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.621053e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.507982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.507982e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.143575 sec + 10,677,287,374 cycles # 3.394 GHz + 20,011,376,117 instructions # 1.87 insn per cycle + 3.147703877 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.214412 sec - 9,956,996,891 cycles # 2.360 GHz - 16,873,658,319 instructions # 1.69 insn per cycle - 4.221168968 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.247983e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.684393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.684393e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.688755 sec + 9,174,309,622 cycles # 3.408 GHz + 16,543,784,332 instructions # 1.80 insn per cycle + 2.692848189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 09aaad1dd8..bec395b2fc 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:26:09 +DATE: 2024-03-01_19:34:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.329039 sec - 4,626,136,964 cycles # 2.966 GHz - 7,229,705,832 instructions # 1.56 insn per cycle - 1.616136536 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.400069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.612023e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.612023e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.368910 sec - 19,436,039,687 cycles # 3.050 GHz - 44,075,637,403 instructions # 2.27 insn per cycle - 6.374367735 seconds time elapsed +TOTAL : 5.119023 sec + 17,915,758,115 cycles # 3.498 GHz + 44,072,070,040 instructions # 2.46 insn per cycle + 5.122383130 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.251714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.897562e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.897562e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.477126 sec - 13,840,650,655 cycles # 3.088 GHz - 31,000,398,658 instructions # 2.24 insn per cycle - 4.482579907 seconds time elapsed +TOTAL : 3.399887 sec + 11,900,907,245 cycles # 3.498 GHz + 30,996,702,046 instructions # 2.60 insn per cycle + 3.403003584 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.718441e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.683164e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.683164e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.779571 sec - 11,221,356,305 cycles # 2.967 GHz - 19,268,573,834 instructions # 1.72 insn per cycle - 3.784933241 seconds time elapsed +TOTAL : 2.889969 sec + 9,778,075,616 cycles # 3.381 GHz + 19,263,677,928 instructions # 1.97 insn per cycle + 2.893107114 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.796994e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.822794e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.822794e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.643336 sec - 10,818,026,445 cycles # 2.966 GHz - 18,676,470,141 instructions # 1.73 insn per cycle - 3.648853496 seconds time elapsed +TOTAL : 2.839931 sec + 9,615,587,176 cycles # 3.383 GHz + 18,673,582,472 instructions # 1.94 insn per cycle + 2.843100077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.505324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.277259e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.277259e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.111357 sec - 9,725,602,646 cycles # 2.364 GHz - 15,429,502,829 instructions # 1.59 insn per cycle - 4.116843302 seconds time elapsed +TOTAL : 2.384419 sec + 8,111,960,646 cycles # 3.399 GHz + 15,421,322,877 instructions # 1.90 insn per cycle + 2.387622919 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index c5fdf6f3c6..9bceb91dab 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,206 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:22:56 +DATE: 2024-03-01_19:34:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.511929e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.606834e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.132028e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.970663 sec - 3,681,129,197 cycles # 3.043 GHz - 7,185,953,404 instructions # 1.95 insn per cycle - 1.266725293 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.129015e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.325606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325606e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.971935 sec - 18,327,370,852 cycles # 3.067 GHz - 43,971,442,751 instructions # 2.40 insn per cycle - 5.977352348 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted + 4,565,469 cycles # 3.250 GHz + 6,288,877 instructions # 1.38 insn per cycle + 0.042167323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.658250e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.168305e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.168305e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.196892 sec - 12,732,971,160 cycles # 3.031 GHz - 30,998,026,084 instructions # 2.43 insn per cycle - 4.202372987 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted + 4,579,291 cycles # 2.729 GHz + 6,314,135 instructions # 1.38 insn per cycle + 0.039024734 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.058430e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.883101e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.883101e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.446922 sec - 10,145,804,321 cycles # 2.940 GHz - 19,366,948,979 instructions # 1.91 insn per cycle - 3.452452971 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted + 4,579,990 cycles # 2.736 GHz + 6,320,509 instructions # 1.38 insn per cycle + 0.038108900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.023243e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.023243e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.339653 sec - 9,693,126,342 cycles # 2.898 GHz - 18,976,550,822 instructions # 1.96 insn per cycle - 3.345442131 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe: Aborted + 4,556,853 cycles # 2.681 GHz + 6,314,123 instructions # 1.39 insn per cycle + 0.038394657 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.879529e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.506982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.506982e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.741561 sec - 8,595,853,951 cycles # 2.295 GHz - 15,727,211,339 instructions # 1.83 insn per cycle - 3.747065146 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe: Aborted + 4,605,984 cycles # 2.673 GHz + 6,322,733 instructions # 1.37 insn per cycle + 0.038325628 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 4a4acadae4..dd5c05e2b0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,208 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:19:38 +DATE: 2024-03-01_19:32:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.841184 sec - 6,281,268,865 cycles # 3.032 GHz - 11,616,541,551 instructions # 1.85 insn per cycle - 2.127335919 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 
12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.931254 sec - 18,320,874,631 cycles # 3.087 GHz - 43,971,483,251 instructions # 2.40 insn per cycle - 5.936943481 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.409868e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.625204e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.625204e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.085348 sec + 17,784,058,163 cycles # 3.496 GHz + 44,072,184,676 instructions # 2.48 insn per cycle + 5.088440281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.142725 sec - 12,747,370,194 cycles # 3.074 GHz - 30,997,666,885 instructions # 2.43 insn per cycle - 4.148307465 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.251897e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.896740e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.896740e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.399875 sec + 11,894,015,527 cycles # 3.496 GHz + 30,996,681,554 instructions # 2.61 insn per cycle + 3.403119350 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.411600 sec - 10,085,079,136 cycles # 2.953 GHz - 19,364,558,625 instructions # 1.92 insn per cycle - 3.417084709 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.713038e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.676668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.676668e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.893827 sec + 9,788,402,080 cycles # 3.380 GHz + 19,263,518,574 instructions # 1.97 insn per cycle + 2.897020495 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.338836 sec - 9,731,023,917 cycles # 2.911 GHz - 18,988,816,377 instructions # 1.95 insn per cycle - 3.344328310 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.802736e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.835064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.835064e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.835699 sec + 9,598,762,677 cycles # 3.382 GHz + 18,673,739,851 instructions # 1.95 insn per cycle + 2.838932444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.766791 sec - 8,586,243,314 cycles # 2.277 GHz - 15,726,194,960 instructions # 1.83 insn per cycle - 3.772300478 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.529044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.287257e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.287257e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.372119 sec + 8,060,639,797 cycles # 3.395 GHz + 15,420,889,940 instructions # 1.91 insn per cycle + 2.375266541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index acaec4a100..c765405708 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:24:28 +DATE: 2024-03-01_19:08:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699180 sec - 2,815,032,547 cycles # 3.020 GHz - 4,411,732,319 instructions # 1.57 insn per cycle - 1.012826906 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.744811 sec - 17,454,360,700 cycles # 3.039 GHz - 41,822,159,126 instructions # 2.40 insn per cycle - 5.754685240 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.526914e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.781748e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.781748e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.743355 sec + 16,588,877,180 cycles # 3.495 GHz + 41,918,002,874 instructions # 2.53 insn per cycle + 4.746678875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.046627 sec - 12,493,235,601 cycles # 3.083 GHz - 30,160,547,265 instructions # 2.41 insn per cycle - 4.067076512 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.291295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.968808e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968808e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.352770 sec + 11,735,836,966 cycles # 3.498 GHz + 30,158,811,446 instructions # 2.57 insn per cycle + 3.356116759 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.357760 sec - 9,927,136,910 cycles # 2.952 GHz - 19,096,793,241 instructions # 1.92 insn per cycle - 3.375474470 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.762119e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.774937e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.774937e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.859880 sec + 9,680,972,149 cycles # 3.382 GHz + 18,995,343,677 instructions # 1.96 insn per cycle + 2.863281821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.243150 sec - 9,616,213,299 cycles # 2.960 GHz - 18,757,748,925 instructions # 1.95 insn per cycle - 3.265371118 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.833992e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.898752e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.898752e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.816063 sec + 9,537,877,818 cycles # 3.384 GHz + 18,442,918,066 instructions # 1.93 insn per cycle + 2.819507820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.680994 sec - 8,464,459,891 cycles # 2.296 GHz - 15,603,182,673 instructions # 1.84 insn per cycle - 3.700542167 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.539315e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.305642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.305642e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.365877 sec + 8,037,965,784 cycles # 3.394 GHz + 15,297,089,513 instructions # 1.90 insn per cycle + 2.369192143 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 5e36a6ad1c..a63eab528c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:02:07 +DATE: 2024-03-01_19:22:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677531 sec - 2,738,360,567 cycles # 3.010 GHz - 4,202,554,319 instructions # 1.53 insn per cycle - 0.971727419 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.107132 sec - 12,669,493,888 cycles # 3.081 GHz - 32,513,570,576 instructions # 2.57 insn per cycle - 4.112837024 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.166985e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.720632e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.720632e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.530132 sec + 12,355,611,055 cycles # 3.498 GHz + 32,613,744,062 instructions # 2.64 insn per cycle + 3.533621370 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.385880 sec - 10,259,128,837 cycles # 3.025 GHz - 24,473,597,991 instructions # 2.39 insn per cycle - 3.391687112 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.915578e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.095121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.095121e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.764375 sec + 9,674,764,507 cycles # 3.497 GHz + 24,473,407,704 instructions # 2.53 insn per cycle + 2.767795138 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.179158 sec - 9,139,183,085 cycles # 2.870 GHz - 16,922,980,195 instructions # 1.85 insn per cycle - 3.185130704 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.020535e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.265971e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.265971e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.659102 sec + 9,008,630,968 cycles # 3.385 GHz + 16,821,530,421 instructions # 1.87 insn per cycle + 2.662558205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.299126 sec - 9,225,486,663 cycles # 2.804 GHz - 16,350,529,622 instructions # 1.77 insn per cycle - 3.305119215 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.305206e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.841692e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.841692e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.493754 sec + 8,724,117,803 cycles # 3.495 GHz + 16,041,038,377 instructions # 1.84 insn per cycle + 2.497150137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.449960 sec - 7,914,148,444 cycles # 2.292 GHz - 14,582,993,732 instructions # 1.84 insn per cycle - 3.455623027 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.721147e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.731450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.731450e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.278737 sec + 7,752,022,969 cycles # 3.398 GHz + 14,276,736,178 instructions # 1.84 insn per cycle + 2.282116900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 640cde8efe..1680b26f09 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:02:37 +DATE: 2024-03-01_19:22:52 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.676373 sec - 2,668,503,996 cycles # 2.929 GHz - 4,153,523,497 instructions # 1.56 insn per cycle - 0.971892133 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.188433 sec - 9,833,021,244 cycles # 3.080 GHz - 25,393,539,961 instructions # 2.58 insn per cycle - 3.194101979 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.961846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.105876e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.105876e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.753891 sec + 9,636,283,642 cycles # 3.496 GHz + 25,494,464,469 instructions # 2.65 insn per cycle + 2.757264718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.899703 sec - 8,920,893,128 cycles # 3.072 GHz - 21,482,466,118 instructions # 2.41 insn per cycle - 2.905533602 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.533991e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.440258e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.440258e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.386937 sec + 8,350,886,622 cycles # 3.495 GHz + 21,482,443,122 instructions # 2.57 insn per cycle + 2.390265855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.882396 sec - 8,595,793,495 cycles # 2.978 GHz - 15,810,706,009 instructions # 1.84 insn per cycle - 2.888136564 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.414671e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.103227e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.103227e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.417911 sec + 8,459,925,330 cycles # 3.495 GHz + 15,709,703,737 instructions # 1.86 insn per cycle + 2.421288962 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.905551 sec - 8,435,887,633 cycles # 2.898 GHz - 15,503,428,881 instructions # 1.84 insn per cycle - 2.911395780 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.582488e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.453415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.453415e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.347054 sec + 8,209,972,658 cycles # 3.494 GHz + 15,201,153,143 instructions # 1.85 insn per cycle + 2.350350106 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.208349 sec - 7,562,205,797 cycles # 2.353 GHz - 14,282,233,625 instructions # 1.89 insn per cycle - 3.214128577 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.888127e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.125675e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.125675e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.206515 sec + 7,514,566,109 cycles # 3.402 GHz + 13,978,476,813 instructions # 1.86 insn per cycle + 2.209835018 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 4388b968c1..e23690073e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:25:01 +DATE: 2024-03-01_19:08:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.080730e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.584592 sec - 2,424,873,450 cycles # 2.992 GHz - 3,757,113,510 instructions # 1.55 insn per cycle - 0.891497126 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.861200 sec - 17,835,681,737 cycles # 3.040 GHz - 43,512,863,183 instructions # 2.44 insn per cycle - 5.870178360 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.579769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.893874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.893874e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.588717 sec + 16,042,603,900 cycles # 3.494 GHz + 43,689,952,449 instructions # 2.72 insn per cycle + 4.591858539 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.010180 sec - 9,264,818,102 cycles # 3.072 GHz - 21,907,230,972 instructions # 2.36 insn per cycle - 3.030108679 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.164150e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.789236e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789236e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.577598 sec + 9,012,166,111 cycles # 3.494 GHz + 21,985,206,002 instructions # 2.44 insn per cycle + 2.580747570 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.786671 sec - 8,293,439,755 cycles # 2.970 GHz - 15,591,050,714 instructions # 1.88 insn per cycle - 2.803351674 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.413033e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.053155e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.053155e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.401302 sec + 8,147,553,270 cycles # 3.390 GHz + 15,499,004,047 instructions # 1.90 insn per cycle + 2.404474906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.857922 sec - 8,240,284,445 cycles # 2.878 GHz - 15,434,807,288 instructions # 1.87 insn per cycle - 2.873134335 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.441969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.116891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.116891e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.399786 sec + 8,145,366,561 cycles # 3.391 GHz + 15,141,676,425 instructions # 1.86 insn per cycle + 2.402973985 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.738177 sec - 6,634,758,903 cycles # 2.418 GHz - 12,863,535,626 instructions # 1.94 insn per cycle - 2.752418443 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.532625e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.114438e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.114438e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 1.959359 sec + 6,688,058,045 cycles # 3.409 GHz + 12,568,020,375 instructions # 1.88 insn per cycle + 1.962540189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 5ebf98d844..09e54302f1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,219 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:13:35 +DATE: 2024-03-01_19:29:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.664885 sec - 5,743,008,286 cycles # 3.032 GHz - 10,353,112,228 instructions # 1.80 insn per cycle - 1.950710268 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.094512 sec - 18,492,834,117 cycles # 3.035 GHz - 43,665,828,462 instructions # 2.36 insn per cycle - 6.100764200 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.560582e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.868731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.868731e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.680582 sec + 16,368,713,672 cycles # 3.495 GHz + 43,840,413,084 instructions # 2.68 insn per cycle + 4.684353022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.242674 sec - 9,984,073,322 cycles # 3.074 GHz - 23,241,211,318 instructions # 2.33 insn per cycle - 3.248988906 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.038598e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.508158e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.508158e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.758988 sec + 9,660,848,077 cycles # 3.498 GHz + 23,320,516,277 instructions # 2.41 insn per cycle + 2.762808644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.031931 sec - 9,018,287,343 cycles # 2.969 GHz - 16,710,480,351 instructions # 1.85 insn per cycle - 3.038355322 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.263246e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739456e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739456e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.576121 sec + 8,764,669,029 cycles # 3.398 GHz + 16,619,915,020 instructions # 1.90 insn per cycle + 2.579987789 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.003313 sec - 8,924,279,581 cycles # 2.966 GHz - 16,553,851,203 instructions # 1.85 insn per cycle - 3.009721457 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.291714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.793688e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.793688e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.573938 sec + 8,759,700,956 cycles # 3.399 GHz + 16,262,157,562 instructions # 1.86 insn per cycle + 2.577692084 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.047824 sec - 7,411,564,908 cycles # 2.428 GHz - 14,070,800,087 instructions # 1.90 insn per cycle - 3.054259465 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.243100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.227413e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.227413e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.161827 sec + 7,403,968,795 cycles # 3.420 GHz + 13,776,638,645 instructions # 1.86 insn per cycle + 2.165624801 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 57f3a9eb6a..22febd8bf2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:26:45 +DATE: 2024-03-01_19:35:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.176348 sec - 4,160,459,328 cycles # 2.977 GHz - 6,608,736,714 instructions # 1.59 insn per cycle - 1.454481545 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.580543e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.894945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.894945e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.084905 sec - 18,848,150,042 cycles # 3.095 GHz - 43,694,410,467 instructions # 2.32 insn per cycle - 6.090122961 seconds time elapsed +TOTAL : 4.585144 sec + 16,033,044,830 cycles # 3.495 GHz + 43,690,102,128 instructions # 2.73 insn per cycle + 4.588130273 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.165795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.795351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.795351e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.340145 sec - 10,237,006,523 cycles # 3.061 GHz - 21,987,992,116 instructions # 2.15 insn per cycle - 3.345494687 seconds time elapsed +TOTAL : 2.575915 sec + 9,005,888,202 cycles # 3.493 GHz + 21,985,016,188 instructions # 2.44 insn per cycle + 2.578895980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.416776e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.062032e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062032e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.130033 sec - 9,276,164,079 cycles # 2.959 GHz - 15,501,530,354 instructions # 1.67 insn per cycle - 3.135291294 seconds time elapsed +TOTAL : 2.398556 sec + 8,138,328,552 cycles # 3.390 GHz + 15,499,199,008 instructions # 1.90 insn per cycle + 2.401542629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.447614e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.125962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.125962e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.090209 sec - 9,218,829,691 cycles # 2.980 GHz - 15,143,949,757 instructions # 1.64 insn per cycle - 3.095551418 seconds time elapsed +TOTAL : 2.395743 sec + 8,131,915,937 cycles # 3.391 GHz + 15,135,872,991 instructions # 1.86 insn per cycle + 2.398760307 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.538033e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.123539e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.123539e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.081111 sec - 7,633,670,846 cycles # 2.474 GHz - 12,572,894,419 instructions # 1.65 insn per cycle - 3.086406325 seconds time elapsed +TOTAL : 1.958851 sec + 6,688,513,152 cycles # 3.411 GHz + 12,568,494,145 instructions # 1.88 insn per cycle + 1.961860292 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 72f866059b..fc344cccba 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,206 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:23:30 +DATE: 2024-03-01_19:34:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.312185e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188856e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.293387e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.840959 sec - 3,233,651,545 cycles # 3.031 GHz - 6,593,293,750 instructions # 2.04 insn per cycle - 1.123835132 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.165423e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.380976e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.380976e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.753852 sec - 17,814,734,742 cycles # 3.094 GHz - 43,512,567,450 instructions # 2.44 insn per cycle - 5.759197636 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted + 4,566,934 cycles # 3.256 GHz + 6,299,440 instructions # 1.38 insn per cycle + 0.038188510 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.367425e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.644557e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.644557e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.015821 sec - 9,302,641,553 cycles # 3.081 GHz - 21,907,397,717 instructions # 2.35 insn per cycle - 3.021054890 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted + 4,511,989 cycles # 2.673 GHz + 6,385,849 instructions # 1.42 insn per cycle + 0.039615964 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.605570e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.994881e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.994881e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.763364 sec - 8,259,626,138 cycles # 2.984 GHz - 15,589,955,941 instructions # 1.89 insn per cycle - 2.768827600 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted + 4,505,776 cycles # 2.678 GHz + 6,324,297 instructions # 1.40 insn per cycle + 0.039078577 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.581356e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971929e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.971929e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.794808 sec - 8,189,932,997 cycles # 2.926 GHz - 15,434,468,382 instructions # 1.88 insn per cycle - 2.800117026 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe: Aborted + 4,566,500 cycles # 3.261 GHz + 6,322,428 instructions # 1.38 insn per cycle + 0.037934624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.644746e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.098711e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.098711e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.731125 sec - 6,642,886,027 cycles # 2.429 GHz - 12,862,690,732 instructions # 1.94 insn per cycle - 2.736362886 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe: Aborted + 4,587,833 cycles # 3.231 GHz + 6,330,442 instructions # 1.38 insn per cycle + 0.038651242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 8d8716bc9a..49d434f85c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,208 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:20:14 +DATE: 2024-03-01_19:32:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.452761 sec - 5,067,036,613 cycles # 3.030 GHz - 9,262,361,364 instructions # 1.83 insn per cycle - 1.731002061 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 
12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.780149 sec - 17,815,433,670 cycles # 3.080 GHz - 43,511,102,764 instructions # 2.44 insn per cycle - 5.785180938 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.581672e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.896680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.896680e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.582644 sec + 16,029,325,021 cycles # 3.496 GHz + 43,690,424,802 instructions # 2.73 insn per cycle + 4.585651317 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.992624 sec - 9,227,327,267 cycles # 3.079 GHz - 21,906,426,544 instructions # 2.37 insn per cycle - 2.997895192 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.149026e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.792646e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.792646e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.584919 sec + 9,045,554,644 cycles # 3.496 GHz + 21,985,410,240 instructions # 2.43 insn per cycle + 2.587906690 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.845512 sec - 8,254,984,848 cycles # 2.896 GHz - 15,590,498,904 instructions # 1.89 insn per cycle - 2.850900280 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.414111e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.052877e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.052877e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.399307 sec + 8,139,048,725 cycles # 3.389 GHz + 15,499,397,322 instructions # 1.90 insn per cycle + 2.402310060 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.764714 sec - 8,215,374,590 cycles # 2.969 GHz - 15,429,066,515 instructions # 1.88 insn per cycle - 2.770036927 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.448749e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.125723e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.125723e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.394319 sec + 8,124,895,108 cycles # 3.390 GHz + 15,136,296,147 instructions # 1.86 insn per cycle + 2.397299852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.731162 sec - 6,615,238,340 cycles # 2.419 GHz - 12,862,797,254 instructions # 1.94 insn per cycle - 2.736410000 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.541933e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.127583e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.127583e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 1.954962 sec + 6,675,259,521 cycles # 3.411 GHz + 12,568,022,737 instructions # 1.88 insn per cycle + 1.957882207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index f9e4000e6d..4dc37a29ce 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:25:31 +DATE: 2024-03-01_19:09:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095054e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581297 sec - 2,416,875,461 cycles # 3.000 GHz - 3,802,904,431 instructions # 1.57 insn per cycle - 0.886522859 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.444566 sec - 16,726,225,777 cycles # 3.070 GHz - 41,270,625,621 instructions # 2.47 insn per cycle - 5.454849598 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.720434e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.100723e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100723e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.265109 sec + 14,915,458,746 cycles # 3.495 GHz + 41,447,917,911 instructions # 2.78 insn per cycle + 4.268275411 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.914617 sec - 8,996,783,237 cycles # 3.081 GHz - 21,210,998,059 instructions # 2.36 insn per cycle - 2.929493898 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.260922e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.026866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.026866e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.519340 sec + 8,813,391,527 cycles # 3.495 GHz + 21,289,273,651 instructions # 2.42 insn per cycle + 2.522478814 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.760181 sec - 8,249,336,928 cycles # 2.983 GHz - 15,425,238,678 instructions # 1.87 insn per cycle - 2.778856529 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.427524e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.128695e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.128695e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.394514 sec + 8,126,459,965 cycles # 3.390 GHz + 15,333,379,094 instructions # 1.89 insn per cycle + 2.397695798 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.789811 sec - 8,096,556,575 cycles # 2.897 GHz - 15,238,891,903 instructions # 1.88 insn per cycle - 2.804859872 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.456500e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.144948e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.144948e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.391151 sec + 8,113,737,929 cycles # 3.390 GHz + 14,939,981,190 instructions # 1.84 insn per cycle + 2.394304414 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.735992 sec - 6,623,617,660 cycles # 2.417 GHz - 12,843,079,376 instructions # 1.94 insn per cycle - 2.752411310 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.545530e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.144301e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.144301e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 1.956043 sec + 6,680,719,629 cycles # 3.411 GHz + 12,547,036,461 instructions # 1.88 insn per cycle + 1.959245995 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052564145764E-002 Relative difference = 1.9988585667912256e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index fde060de72..5416692647 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:03:05 +DATE: 2024-03-01_19:23:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181869e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.576138 sec - 2,415,755,755 cycles # 3.001 GHz - 3,734,378,655 instructions # 1.55 insn per cycle - 0.864225849 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.002640 sec - 12,159,409,273 cycles # 3.035 GHz - 32,432,694,101 instructions # 2.67 insn per cycle - 4.008158303 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.220532e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.908131e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.908131e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.440500 sec + 12,037,718,902 cycles # 3.497 GHz + 32,611,773,785 instructions # 2.71 insn per cycle + 3.443737801 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039840314887E-002 Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.601867 sec - 7,999,882,010 cycles # 3.069 GHz - 18,656,600,340 instructions # 2.33 insn per cycle - 2.607493343 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.735307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.227346e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.227346e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.275375 sec + 7,963,357,113 cycles # 3.496 GHz + 18,736,043,078 instructions # 2.35 insn per cycle + 2.278622500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039283704129E-002 Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.492780 sec - 7,427,313,914 cycles # 2.974 GHz - 14,251,086,474 instructions # 1.92 insn per cycle - 2.498394316 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.874885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.138866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.138866e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.180630 sec + 7,418,462,702 cycles # 3.398 GHz + 14,160,332,853 instructions # 1.91 insn per cycle + 2.183917049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.444620 sec - 7,299,238,549 cycles # 2.980 GHz - 13,947,633,533 instructions # 1.91 insn per cycle - 2.450212772 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.068100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.691798e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.691798e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.120331 sec + 7,409,259,501 cycles # 3.490 GHz + 13,649,785,865 instructions # 1.84 insn per cycle + 2.123584431 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.681955 sec - 6,492,318,128 cycles # 2.417 GHz - 13,422,094,611 instructions # 2.07 insn per cycle - 2.687432186 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.512909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.081395e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081395e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 1.966622 sec + 6,716,721,759 cycles # 3.411 GHz + 13,128,043,043 instructions # 1.95 insn per cycle + 1.969885224 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052562326775E-002 Relative difference = 1.997440588685788e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 0d6d3b3db1..ad73c3d757 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_03:03:32 +DATE: 2024-03-01_19:23:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204111e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.576922 sec - 2,404,705,116 cycles # 2.985 GHz - 3,758,296,111 instructions # 1.56 insn per cycle - 0.864210592 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.097656 sec - 9,472,450,742 cycles # 3.053 GHz - 25,268,175,697 instructions # 2.67 insn per cycle - 3.103042436 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.058716e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.506875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.506875e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 2.666546 sec + 9,327,231,220 cycles # 3.496 GHz + 25,447,900,546 instructions # 2.73 insn per cycle + 2.669764195 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039838495897E-002 Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.415041 sec - 7,164,638,851 cycles # 2.961 GHz - 16,869,197,703 instructions # 2.35 insn per cycle - 2.420723497 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.323765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.058451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.058451e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.046425 sec + 7,160,329,051 cycles # 3.495 GHz + 16,947,102,157 instructions # 2.37 insn per cycle + 2.049696854 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.394138 sec - 7,165,321,711 cycles # 2.987 GHz - 13,616,190,038 instructions # 1.90 insn per cycle - 2.399577311 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.297201e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.290316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.290316e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.022038 sec + 7,066,919,623 cycles # 3.491 GHz + 13,525,128,890 instructions # 1.91 insn per cycle + 2.025234558 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.363661 sec - 7,031,964,685 cycles # 2.970 GHz - 13,425,613,371 instructions # 1.91 insn per cycle - 2.369281481 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.389695e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.530798e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.530798e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.007329 sec + 7,015,071,750 cycles # 3.491 GHz + 13,133,210,306 instructions # 1.87 insn per cycle + 2.010465766 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.592425 sec - 6,321,858,831 cycles # 2.434 GHz - 13,153,560,775 instructions # 2.08 insn per cycle - 2.597985755 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.659878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.534461e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.534461e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 1.922796 sec + 6,566,422,271 cycles # 3.411 GHz + 12,859,737,400 instructions # 1.96 insn per cycle + 1.926087575 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052536860923E-002 Relative difference = 1.977588895209662e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 4be3e76490..837cf18a48 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:26:01 +DATE: 2024-03-01_19:09:26 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.717219 sec - 2,841,227,385 cycles # 2.957 GHz - 4,430,504,412 instructions # 1.56 insn per cycle - 1.049815549 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.069129 sec - 18,728,354,553 cycles # 3.083 GHz - 44,224,513,518 instructions # 2.36 insn per cycle - 6.079869673 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.377191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.581422e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.581422e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.191610 sec + 18,159,288,198 cycles # 3.496 GHz + 44,323,801,591 instructions # 2.44 insn per cycle + 5.194897716 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.001256 sec - 12,323,242,096 cycles # 3.075 GHz - 30,917,838,115 instructions # 2.51 insn per cycle - 4.017904894 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.293424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.965077e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.965077e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.349210 sec + 11,716,680,806 cycles # 3.496 GHz + 30,916,285,925 instructions # 2.64 insn per cycle + 3.352547582 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.416443 sec - 10,120,877,504 cycles # 2.958 GHz - 19,374,733,180 instructions # 1.91 insn per cycle - 3.431641491 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.736734e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.720058e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.720058e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.874813 sec + 9,725,993,159 cycles # 3.380 GHz + 19,272,349,716 instructions # 1.98 insn per cycle + 2.878123285 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.374976 sec - 9,706,052,635 cycles # 2.871 GHz - 18,944,519,271 instructions # 1.95 insn per cycle - 3.395274500 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.837062e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.944737e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.944737e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.808701 sec + 9,507,947,404 cycles # 3.382 GHz + 18,641,360,147 instructions # 1.96 insn per cycle + 2.812001364 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.760847 sec - 8,409,257,244 cycles # 2.233 GHz - 15,057,436,319 instructions # 1.79 insn per cycle - 3.776930410 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.604258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.479437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.479437e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.334946 sec + 7,938,052,698 cycles # 3.396 GHz + 14,749,724,015 instructions # 1.86 insn per cycle + 2.338250895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 77001f8935..076a22f416 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_02:26:35 +DATE: 2024-03-01_19:09:47 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699538 sec - 2,805,342,043 cycles # 2.999 GHz - 4,414,010,673 instructions # 1.57 insn per cycle - 1.020206687 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.837265 sec - 18,090,198,997 cycles # 3.097 GHz - 42,472,863,850 instructions # 2.35 insn per cycle - 5.848007644 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.451232e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.679621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.679621e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.958423 sec + 17,349,182,496 cycles # 3.497 GHz + 42,572,483,064 instructions # 2.45 insn per cycle + 4.961760502 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.920672 sec - 12,137,736,337 cycles # 3.092 GHz - 30,225,042,392 instructions # 2.49 insn per cycle - 3.938311189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.362004e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.081863e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.081863e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.272026 sec + 11,447,774,045 cycles # 3.496 GHz + 30,223,060,667 instructions # 2.64 insn per cycle + 3.275374923 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.437770 sec - 10,015,371,277 cycles # 2.909 GHz - 19,256,811,213 instructions # 1.92 insn per cycle - 3.454377757 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.762711e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.775192e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.775192e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.854465 sec + 9,656,821,540 cycles # 3.380 GHz + 19,154,909,313 instructions # 1.98 insn per cycle + 2.857812496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.235635 sec - 9,645,810,411 cycles # 2.976 GHz - 18,756,051,671 instructions # 1.94 insn per cycle - 3.251774736 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.912614e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.047841e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.047841e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.751170 sec + 9,315,104,434 cycles # 3.383 GHz + 18,442,592,455 instructions # 1.98 insn per cycle + 2.754516121 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.592139 sec - 8,293,535,644 cycles # 2.305 GHz - 14,979,176,568 instructions # 1.81 insn per cycle - 3.613399615 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.659029e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.577201e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.577201e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.308570 sec + 7,854,338,821 cycles # 3.399 GHz + 14,673,429,400 instructions # 1.87 insn per cycle + 2.311927939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 9a5df19d5b..38908c3fb1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:27:08 +DATE: 2024-03-01_19:10:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.535145 sec - 2,303,454,226 cycles # 2.990 GHz - 3,249,200,622 instructions # 1.41 insn per cycle - 0.848848936 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.903669 sec - 15,175,795,116 cycles # 3.093 GHz - 38,374,949,840 instructions # 2.53 insn per cycle - 4.917105673 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.782462e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856967e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856967e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.897327 sec + 13,634,239,091 cycles # 3.496 GHz + 38,385,444,668 instructions # 2.82 insn per cycle + 3.900701537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.968890 sec - 9,101,848,873 cycles # 3.060 GHz - 24,578,505,710 instructions # 2.70 insn per cycle - 2.986159008 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.802582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.042466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.042466e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.309068 sec + 8,081,737,607 cycles # 3.496 GHz + 24,573,373,219 instructions # 3.04 insn per cycle + 2.312471054 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.936093 sec - 5,474,671,571 cycles # 2.819 GHz - 11,252,385,098 instructions # 2.06 insn per cycle - 1.954008279 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.804161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.442356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.442356e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.464920 sec + 4,882,777,358 cycles # 3.328 GHz + 11,230,055,186 instructions # 2.30 insn per cycle + 1.468368837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.774092 sec - 4,972,729,611 cycles # 2.794 GHz - 10,557,445,760 instructions # 2.12 insn per cycle - 1.789622209 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.365690e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.101366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.101366e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.377099 sec + 4,595,020,426 cycles # 3.330 GHz + 10,504,303,399 instructions # 2.29 insn per cycle + 1.380458418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.799185 sec - 5,395,066,029 cycles # 1.924 GHz - 7,793,871,634 instructions # 1.44 insn per cycle - 2.817161041 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.158651e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.839395e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.839395e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.408011 sec + 4,692,884,605 cycles # 3.327 GHz + 7,735,219,640 instructions # 1.65 insn per cycle + 1.411361186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 598396a8e7..3f9ce1ce83 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,219 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:14:07 +DATE: 2024-03-01_19:29:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.801549 sec - 3,157,604,220 cycles # 3.025 GHz - 4,827,294,021 instructions # 1.53 insn per cycle - 1.101037847 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.008942 sec - 15,497,351,856 cycles # 3.090 GHz - 38,433,512,801 instructions # 2.48 insn per cycle - 5.015755142 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.768798e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.843131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.843131e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.955815 sec + 13,832,169,175 cycles # 3.494 GHz + 38,450,905,414 instructions # 2.78 insn per cycle + 3.959789848 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.090616 sec - 9,430,020,802 cycles # 3.049 GHz - 24,763,068,407 instructions # 2.63 insn per cycle - 3.097621879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.744115e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.977846e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.977846e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.391627 sec + 8,365,963,654 cycles # 3.493 GHz + 24,752,915,009 instructions # 2.96 insn per cycle + 2.395600644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.984017 sec - 5,826,620,771 cycles # 2.928 GHz - 11,538,062,844 instructions # 1.98 insn per cycle - 1.990946794 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.710929e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.332597e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.332597e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.535175 sec + 5,133,184,909 cycles # 3.337 GHz + 11,512,737,529 instructions # 2.24 insn per cycle + 1.539209812 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.799262 sec - 5,294,562,816 cycles # 2.933 GHz - 10,843,404,980 instructions # 2.05 insn per cycle - 1.806082483 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.182780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.885497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.885497e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.458979 sec + 4,882,301,300 cycles # 3.339 GHz + 10,787,066,129 instructions # 2.21 insn per cycle + 1.463011407 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.778138 sec - 5,743,518,580 cycles # 2.063 GHz - 8,037,207,687 instructions # 1.40 insn per cycle - 2.785184310 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.015903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.677662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.677662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.490826 sec + 4,984,106,886 cycles # 3.336 GHz + 7,975,592,103 instructions # 1.60 insn per cycle + 1.494900312 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 977053e874..bbbf7c9fcf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:27:17 +DATE: 2024-03-01_19:35:21 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.617245 sec - 2,532,813,012 cycles # 2.999 GHz - 3,701,870,616 instructions # 1.46 insn per cycle - 0.904006340 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.781344e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.966854 sec - 15,343,121,883 cycles # 3.087 GHz - 38,390,661,623 instructions # 2.50 insn per cycle - 4.972403311 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.899236 sec + 13,635,452,338 cycles # 3.495 GHz + 38,385,453,862 instructions # 2.82 insn per cycle + 3.902350302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.795094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.033476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.033476e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.079495 sec - 9,279,730,828 cycles # 3.010 GHz - 24,577,932,954 instructions # 2.65 insn per cycle - 3.085060857 seconds time elapsed +TOTAL : 2.312363 sec + 8,089,982,420 cycles # 3.495 GHz + 24,573,716,651 instructions # 3.04 insn per cycle + 2.315558026 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.852044e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.495498e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.495498e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.937503 sec - 5,654,473,993 cycles # 2.911 GHz - 11,233,989,199 instructions # 1.99 insn per cycle - 1.943141738 seconds time elapsed +TOTAL : 1.456479 sec + 4,857,277,999 cycles # 3.329 GHz + 11,230,242,008 instructions # 2.31 insn per cycle + 1.459674299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.332356e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.056692e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.056692e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.757396 sec - 5,128,637,723 cycles # 2.910 GHz - 10,505,547,256 instructions # 2.05 insn per cycle - 1.762900213 seconds time elapsed +TOTAL : 1.381870 sec + 4,611,557,382 cycles # 3.331 GHz + 10,504,643,617 instructions # 2.28 insn per cycle + 1.385061393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.304542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.010565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.010565e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.739915 sec - 5,558,468,681 cycles # 2.025 GHz - 7,741,606,815 instructions # 1.39 insn per cycle - 2.745378653 seconds time elapsed +TOTAL : 1.385516 sec + 4,620,699,813 cycles # 3.329 GHz + 7,735,038,876 instructions # 1.67 insn per cycle + 1.388698104 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 29a670398e..f421a264ca 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,206 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:24:00 +DATE: 2024-03-01_19:34:14 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.579097e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155655e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270242e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.554530 sec - 2,358,271,315 cycles # 3.013 GHz - 3,682,090,929 instructions # 1.56 insn per cycle - 0.840283729 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.177843e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.241689e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.241689e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.919493 sec - 15,156,700,875 cycles # 3.078 GHz - 38,373,397,442 instructions # 2.53 insn per cycle - 4.925048190 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted + 4,520,752 cycles # 3.261 GHz + 6,298,367 instructions # 1.39 insn per cycle + 0.037814907 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.588081e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.785746e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.785746e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.028765 sec - 9,114,596,397 cycles # 3.011 GHz - 24,581,732,536 instructions # 2.70 insn per cycle - 3.034354491 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted + 4,515,960 cycles # 3.261 GHz + 6,261,612 instructions # 1.39 insn per cycle + 0.038098825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.938829e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.476997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.476997e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.869884 sec - 5,467,539,853 cycles # 2.917 GHz - 11,251,237,475 instructions # 2.06 insn per cycle - 1.875504692 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted + 4,384,140 cycles # 2.671 GHz + 6,269,302 instructions # 1.43 insn per cycle + 0.039362811 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.273575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.896545e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.896545e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.780053 sec - 4,944,261,583 cycles # 2.770 GHz - 10,558,833,446 instructions # 2.14 insn per cycle - 1.785881884 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe: Aborted + 4,828,863 cycles # 3.264 GHz + 6,315,171 instructions # 1.31 insn per cycle + 0.038710635 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.090701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.328087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.328087e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.667720 sec - 5,371,754,599 cycles # 2.010 GHz - 7,792,372,952 instructions # 1.45 insn per cycle - 2.673339648 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe: Aborted + 4,574,413 cycles # 2.717 GHz + 6,280,420 instructions # 1.37 insn per cycle + 0.038677314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e5cfc13b3e..0b4ae759d7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,208 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:20:45 +DATE: 2024-03-01_19:32:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.697938 sec - 2,798,675,219 cycles # 3.021 GHz - 4,376,672,842 instructions # 1.56 insn per cycle - 0.983897382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.893907 sec - 15,162,024,600 cycles # 3.096 GHz - 38,372,989,497 instructions # 2.53 insn per cycle - 4.899450957 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.782456e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.857581e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.857581e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.898425 sec + 13,637,022,234 cycles # 3.496 GHz + 38,385,249,434 instructions # 2.81 insn per cycle + 3.901702317 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.935182 sec - 9,091,941,153 cycles # 3.094 GHz - 24,577,519,112 instructions # 2.70 insn per cycle - 2.940777194 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.798533e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.038408e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.038408e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.311197 sec + 8,087,896,887 cycles # 3.496 GHz + 24,573,667,331 instructions # 3.04 insn per cycle + 2.314383849 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.869802 sec - 5,458,289,042 cycles # 2.911 GHz - 11,250,961,339 instructions # 2.06 insn per cycle - 1.875881825 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.865253e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.517718e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.517718e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.455280 sec + 4,850,997,989 cycles # 3.328 GHz + 11,230,074,042 instructions # 2.32 insn per cycle + 1.458527044 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.719311 sec - 5,034,836,824 cycles # 2.920 GHz - 10,558,271,294 instructions # 2.10 insn per cycle - 1.725057980 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.377231e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.116751e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.116751e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.375896 sec + 4,589,261,968 cycles # 3.330 GHz + 10,502,721,091 instructions # 2.29 insn per cycle + 1.379048846 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.716839 sec - 5,403,556,568 cycles # 1.987 GHz - 7,794,191,095 instructions # 1.44 insn per cycle - 2.722528243 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.291081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.994820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.994820e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.387507 sec + 4,623,729,787 cycles # 3.327 GHz + 7,735,191,540 instructions # 1.67 insn per cycle + 1.390703747 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 73356b00dd..3c03138a4d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:27:35 +DATE: 2024-03-01_19:10:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.538743 sec - 2,297,794,086 cycles # 2.963 GHz - 3,276,125,304 instructions # 1.43 insn per cycle - 0.856267333 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.877526 sec - 15,081,677,651 cycles # 3.089 GHz - 40,100,660,385 instructions # 2.66 insn per cycle - 4.889980594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.710529e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781570e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.996408 sec + 13,979,413,584 cycles # 3.496 GHz + 40,196,760,217 instructions # 2.88 insn per cycle + 3.999779379 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.787478 sec - 8,606,981,244 cycles # 3.082 GHz - 23,670,854,000 instructions # 2.75 insn per cycle - 2.801213189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.077291e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.346738e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.346738e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.191848 sec + 7,669,887,605 cycles # 3.495 GHz + 23,666,455,211 instructions # 3.09 insn per cycle + 2.195224486 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.088271 sec - 6,101,163,180 cycles # 2.915 GHz - 13,060,965,379 instructions # 2.14 insn per cycle - 2.110411764 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.056560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.577761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.577761e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.608065 sec + 5,356,503,720 cycles # 3.326 GHz + 13,039,165,916 instructions # 2.43 insn per cycle + 1.611516596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.007458 sec - 5,795,313,103 cycles # 2.878 GHz - 12,320,114,352 instructions # 2.13 insn per cycle - 2.035740422 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.286280e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.839274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.839274e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.563652 sec + 5,212,654,568 cycles # 3.328 GHz + 12,266,825,956 instructions # 2.35 insn per cycle + 1.567147417 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.054998 sec - 5,836,990,709 cycles # 1.908 GHz - 9,601,704,067 instructions # 1.64 insn per cycle - 3.069883688 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.906483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.553389e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.553389e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.449150 sec + 4,824,305,914 cycles # 3.323 GHz + 9,542,657,586 instructions # 1.98 insn per cycle + 1.452571538 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 7ca7ca6f27..25ef1d5f3d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:03:58 +DATE: 2024-03-01_19:23:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.520509 sec - 2,251,864,611 cycles # 2.979 GHz - 3,200,076,053 instructions # 1.42 insn per cycle - 0.813049887 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.235724 sec - 13,018,811,907 cycles # 3.070 GHz - 34,384,492,801 instructions # 2.64 insn per cycle - 4.241723051 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.192021e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.290805e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.290805e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.415579 sec + 11,947,926,572 cycles # 3.495 GHz + 34,397,405,204 instructions # 2.88 insn per cycle + 3.419004365 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.527791 sec - 10,618,068,276 cycles # 3.005 GHz - 24,006,297,751 instructions # 2.26 insn per cycle - 3.533644608 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.733586e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.879812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879812e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.934771 sec + 10,271,141,406 cycles # 3.497 GHz + 24,003,451,169 instructions # 2.34 insn per cycle + 2.938285168 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.268558 sec - 6,594,099,256 cycles # 2.900 GHz - 12,400,446,525 instructions # 1.88 insn per cycle - 2.274329127 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.994183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.364715e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.364715e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.870369 sec + 6,224,279,915 cycles # 3.323 GHz + 12,380,406,144 instructions # 1.99 insn per cycle + 1.873883038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.142175 sec - 6,250,159,272 cycles # 2.911 GHz - 11,574,474,977 instructions # 1.85 insn per cycle - 2.148019416 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.449558e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.878567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.878567e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.751343 sec + 5,830,240,093 cycles # 3.324 GHz + 11,520,328,697 instructions # 1.98 insn per cycle + 1.754890337 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.637824 sec - 5,343,225,675 cycles # 2.022 GHz - 9,294,792,947 instructions # 1.74 insn per cycle - 2.643638198 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.956146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.616868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.616868e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.441128 sec + 4,799,320,936 cycles # 3.324 GHz + 9,238,083,496 instructions # 1.92 insn per cycle + 1.444627570 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 6740b658ab..94cf15e1a9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:04:25 +DATE: 2024-03-01_19:23:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525125 sec - 2,266,508,632 cycles # 2.999 GHz - 3,227,683,893 instructions # 1.42 insn per cycle - 0.815560561 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.008193 sec - 12,350,315,150 cycles # 3.077 GHz - 35,037,181,267 instructions # 2.84 insn per cycle - 4.014100641 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.402676e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.515698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.515698e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.211500 sec + 11,235,460,343 cycles # 3.496 GHz + 35,050,056,722 instructions # 3.12 insn per cycle + 3.214868359 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.458899 sec - 10,688,048,117 cycles # 3.085 GHz - 23,082,662,787 instructions # 2.16 insn per cycle - 3.464737128 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.645770e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783620e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.003938 sec + 10,512,340,009 cycles # 3.496 GHz + 23,080,735,378 instructions # 2.20 insn per cycle + 3.007803609 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.175532 sec - 6,167,789,524 cycles # 2.829 GHz - 11,956,365,830 instructions # 1.94 insn per cycle - 2.181490352 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.439627e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.869125e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.869125e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.750278 sec + 5,827,983,221 cycles # 3.325 GHz + 11,935,795,493 instructions # 2.05 insn per cycle + 1.753728973 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.062589 sec - 6,012,687,929 cycles # 2.908 GHz - 11,129,506,913 instructions # 1.85 insn per cycle - 2.068524285 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.801848e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.279461e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.279461e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.666561 sec + 5,551,706,657 cycles # 3.326 GHz + 11,073,468,618 instructions # 1.99 insn per cycle + 1.670062494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.581777 sec - 5,215,223,845 cycles # 2.016 GHz - 9,019,923,506 instructions # 1.73 insn per cycle - 2.587755549 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.282360e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.994130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.994130e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.389411 sec + 4,628,373,888 cycles # 3.325 GHz + 8,962,575,488 instructions # 1.94 insn per cycle + 1.392904800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 3164378b7a..a7acc63d87 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:04 +DATE: 2024-03-01_19:10:38 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.485254 sec - 2,068,141,298 cycles # 2.904 GHz - 2,916,142,359 instructions # 1.41 insn per cycle - 0.784434250 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.621612 sec - 14,026,409,554 cycles # 3.032 GHz - 38,341,238,705 instructions # 2.73 insn per cycle - 4.632085783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.061036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.162907e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.162907e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.542085 sec + 12,391,424,054 cycles # 3.496 GHz + 38,265,921,403 instructions # 3.09 insn per cycle + 3.545513591 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.094155 sec - 6,477,656,873 cycles # 3.085 GHz - 15,815,714,256 instructions # 2.44 insn per cycle - 2.109661469 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.565544e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.063647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.063647e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.710778 sec + 5,991,801,017 cycles # 3.497 GHz + 15,824,708,814 instructions # 2.64 insn per cycle + 1.714001105 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.180439 sec - 3,464,791,228 cycles # 2.924 GHz - 7,594,553,534 instructions # 2.19 insn per cycle - 1.196926932 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.241504e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.408345e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.408345e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.952958 sec + 3,189,843,324 cycles # 3.339 GHz + 7,575,039,950 instructions # 2.37 insn per cycle + 0.956152471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.103361 sec - 3,253,544,502 cycles # 2.935 GHz - 7,202,500,133 instructions # 2.21 insn per cycle - 1.115792553 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.319510e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510809e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510809e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.906095 sec + 3,034,230,865 cycles # 3.340 GHz + 7,150,811,632 instructions # 2.36 insn per cycle + 0.909351723 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.467307 sec - 3,062,229,633 cycles # 2.079 GHz - 5,834,823,887 instructions # 1.91 insn per cycle - 1.480044473 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.446120e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.684000e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.684000e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.836297 sec + 2,802,120,149 cycles # 3.341 GHz + 5,780,771,468 instructions # 2.06 insn per cycle + 0.839556975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index b32abcb3fe..244f7ed452 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,219 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:14:35 +DATE: 2024-03-01_19:29:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.665285 sec - 2,679,931,908 cycles # 3.001 GHz - 4,173,181,221 instructions # 1.56 insn per cycle - 0.950193790 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.608146 sec - 14,198,803,048 cycles # 3.078 GHz - 38,383,841,480 instructions # 2.70 insn per cycle - 4.614561058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.053680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.154942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.154942e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.570943 sec + 12,491,119,184 cycles # 3.495 GHz + 38,309,264,755 instructions # 3.07 insn per cycle + 3.574697502 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.164951 sec - 6,682,648,138 cycles # 3.079 GHz - 16,095,511,662 instructions # 2.41 insn per cycle - 2.171478460 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.494756e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.984731e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.984731e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.760397 sec + 6,151,072,141 cycles # 3.488 GHz + 16,106,098,708 instructions # 2.62 insn per cycle + 1.764265698 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.245724 sec - 3,655,872,382 cycles # 2.921 GHz - 7,830,960,228 instructions # 2.14 insn per cycle - 1.252058919 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.220386e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.382151e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.382151e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.999164 sec + 3,353,390,197 cycles # 3.346 GHz + 7,813,166,304 instructions # 2.33 insn per cycle + 1.003041613 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.193275 sec - 3,439,455,837 cycles # 2.869 GHz - 7,440,735,686 instructions # 2.16 insn per cycle - 1.199824293 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.301043e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.484531e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.484531e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.948601 sec + 3,185,245,475 cycles # 3.348 GHz + 7,388,936,707 instructions # 2.32 insn per cycle + 0.952384757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.539244 sec - 3,276,504,779 cycles # 2.121 GHz - 6,089,433,455 instructions # 1.86 insn per cycle - 1.545785864 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.412343e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.637273e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.637273e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.886349 sec + 2,975,186,907 cycles # 3.345 GHz + 6,036,018,349 instructions # 2.03 insn per cycle + 0.890113209 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 1418229a2f..d4cdb5a44f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:27:44 +DATE: 2024-03-01_19:35:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962164e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.558880 sec - 2,364,095,478 cycles # 3.003 GHz - 3,484,344,192 instructions # 1.47 insn per cycle - 0.845198156 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.058975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.161124e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.161124e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.585598 sec - 14,172,267,813 cycles # 3.088 GHz - 38,370,669,897 instructions # 2.71 insn per cycle - 4.590984697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.544825 sec + 12,394,762,036 cycles # 3.494 GHz + 38,265,645,755 instructions # 3.09 insn per cycle + 3.547916257 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.488571e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.974371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.974371e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.148796 sec - 6,634,619,629 cycles # 3.081 GHz - 15,827,825,218 instructions # 2.39 insn per cycle - 2.154083020 seconds time elapsed +TOTAL : 1.729685 sec + 6,044,983,083 cycles # 3.490 GHz + 15,825,361,332 instructions # 2.62 insn per cycle + 1.732686150 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.240146e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.410041e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410041e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.236002 sec - 3,624,228,310 cycles # 2.921 GHz - 7,577,923,207 instructions # 2.09 insn per cycle - 1.241371528 seconds time elapsed +TOTAL : 0.954496 sec + 3,193,272,434 cycles # 3.337 GHz + 7,575,303,386 instructions # 2.37 insn per cycle + 0.957550268 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.319772e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511858e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511858e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.166800 sec - 3,412,475,771 cycles # 2.913 GHz - 7,154,107,852 instructions # 2.10 insn per cycle - 1.172143118 seconds time elapsed +TOTAL : 0.905312 sec + 3,033,184,068 cycles # 3.342 GHz + 7,150,791,405 instructions # 2.36 insn per cycle + 0.908396432 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.446800e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685435e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685435e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.519807 sec - 3,228,336,001 cycles # 2.118 GHz - 5,784,936,071 instructions # 1.79 insn per cycle - 1.525231071 seconds time elapsed +TOTAL : 0.835810 sec + 2,801,569,225 cycles # 3.343 GHz + 5,780,758,142 instructions # 2.06 insn per cycle + 0.838834457 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 6cc1ea482a..54a85a955c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,206 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:24:27 +DATE: 2024-03-01_19:34:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.444388e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.637591e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958095e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.506234 sec - 2,151,061,698 cycles # 2.964 GHz - 3,317,932,316 instructions # 1.54 insn per cycle - 0.783859096 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.348187e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425786e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.425786e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.550945 sec - 14,020,959,724 cycles # 3.078 GHz - 38,340,893,799 instructions # 2.73 insn per cycle - 4.556370309 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted + 4,538,068 cycles # 3.245 GHz + 6,262,611 instructions # 1.38 insn per cycle + 0.037803610 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.084306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.497288e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.497288e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.146528 sec - 6,470,246,026 cycles # 3.008 GHz - 15,815,477,798 instructions # 2.44 insn per cycle - 2.151761392 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted + 4,534,363 cycles # 3.253 GHz + 6,314,690 instructions # 1.39 insn per cycle + 0.037873298 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.654131e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.108425e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.108425e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.168173 sec - 3,446,745,579 cycles # 2.939 GHz - 7,593,552,481 instructions # 2.20 insn per cycle - 1.173417445 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted + 4,637,775 cycles # 3.255 GHz + 6,328,952 instructions # 1.36 insn per cycle + 0.038380049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.035097e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.201064e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.096134 sec - 3,246,063,667 cycles # 2.949 GHz - 7,201,559,823 instructions # 2.22 insn per cycle - 1.101526557 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe: Aborted + 4,573,961 cycles # 3.257 GHz + 6,321,016 instructions # 1.38 insn per cycle + 0.037677949 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.601752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.455480e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.455480e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.463312 sec - 3,061,733,109 cycles # 2.086 GHz - 5,833,735,363 instructions # 1.91 insn per cycle - 1.468683964 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe: Aborted + 4,502,378 cycles # 3.260 GHz + 6,322,124 instructions # 1.40 insn per cycle + 0.037792372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index d1c301e36a..c1a03aebbd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,208 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:21:13 +DATE: 2024-03-01_19:33:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.625420 sec - 2,414,961,393 cycles # 2.854 GHz - 3,791,061,685 instructions # 1.57 insn per cycle - 0.904442863 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.586154 sec - 14,183,213,679 cycles # 3.090 GHz - 38,341,040,102 instructions # 2.70 insn per cycle - 4.591510537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.061154e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.162932e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.162932e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.542051 sec + 12,388,794,866 cycles # 3.495 GHz + 38,264,992,582 instructions # 3.09 insn per cycle + 3.545107831 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.084805 sec - 6,467,654,599 cycles # 3.095 GHz - 15,814,952,627 instructions # 2.45 insn per cycle - 2.090234852 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.542448e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.037442e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.037442e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.716038 sec + 6,003,936,148 cycles # 3.494 GHz + 15,824,627,159 instructions # 2.64 insn per cycle + 1.719029866 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.181028 sec - 3,453,301,700 cycles # 2.913 GHz - 7,593,575,205 instructions # 2.20 insn per cycle - 1.186225517 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.242773e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.410199e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410199e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.952242 sec + 3,187,363,454 cycles # 3.339 GHz + 7,575,422,549 instructions # 2.38 insn per cycle + 0.955261247 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.108864 sec - 3,247,038,827 cycles # 2.916 GHz - 7,202,168,264 instructions # 2.22 insn per cycle - 1.114391762 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.320310e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510744e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510744e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 0.905497 sec + 3,033,662,646 cycles # 3.342 GHz + 7,150,728,059 instructions # 2.36 insn per cycle + 0.908547463 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.464294 sec - 3,059,603,183 cycles # 2.083 GHz - 5,833,854,527 instructions # 1.91 insn per cycle - 1.469681735 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.448049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685618e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685618e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.835256 sec + 2,796,643,852 cycles # 3.341 GHz + 5,780,733,558 instructions # 2.07 insn per cycle + 0.838315041 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index adc2ed2114..8b91db0e17 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:27 +DATE: 2024-03-01_19:10:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480923 sec - 2,116,431,851 cycles # 3.003 GHz - 3,022,655,895 instructions # 1.43 insn per cycle - 0.777218279 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.644587 sec - 14,360,257,758 cycles # 3.089 GHz - 39,833,716,550 instructions # 2.77 insn per cycle - 4.652300252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.113653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.221073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.221073e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.483745 sec + 12,191,233,427 cycles # 3.497 GHz + 39,807,639,093 instructions # 3.27 insn per cycle + 3.486969433 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199028000236 Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.888755 sec - 5,601,188,109 cycles # 2.957 GHz - 15,285,931,975 instructions # 2.73 insn per cycle - 1.901754882 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.134346e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.923260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.923260e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.402390 sec + 4,912,248,574 cycles # 3.497 GHz + 15,294,202,423 instructions # 3.11 insn per cycle + 1.405648629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.623137 sec - 4,755,173,593 cycles # 2.919 GHz - 9,735,141,159 instructions # 2.05 insn per cycle - 1.639641207 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.083568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.948403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.948403e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.262664 sec + 4,211,226,733 cycles # 3.329 GHz + 9,715,508,285 instructions # 2.31 insn per cycle + 1.265920918 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.586631 sec - 4,632,931,570 cycles # 2.912 GHz - 9,326,747,974 instructions # 2.01 insn per cycle - 1.599475417 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.245949e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.013771e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013771e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.245932 sec + 4,154,573,718 cycles # 3.329 GHz + 9,273,579,239 instructions # 2.23 insn per cycle + 1.249125651 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.762945 sec - 3,668,593,409 cycles # 2.074 GHz - 7,034,535,336 instructions # 1.92 insn per cycle - 1.779301540 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.182229e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.339411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.339411e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.997929 sec + 3,329,282,294 cycles # 3.328 GHz + 6,979,885,802 instructions # 2.10 insn per cycle + 1.001225403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183459779248 Relative difference = 1.7053177021099307e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 82aee2242c..7b61726f5d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:04:53 +DATE: 2024-03-01_19:24:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478757 sec - 2,104,839,063 cycles # 2.996 GHz - 2,995,662,279 instructions # 1.42 insn per cycle - 0.760483148 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.311067 sec - 12,598,770,011 cycles # 2.919 GHz - 34,372,549,657 instructions # 2.73 insn per cycle - 4.316594695 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.274947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.391721e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391721e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.318252 sec + 11,611,007,117 cycles # 3.497 GHz + 34,397,967,949 instructions # 2.96 insn per cycle + 3.321458141 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.978899 sec - 6,105,197,866 cycles # 3.078 GHz - 14,859,942,037 instructions # 2.43 insn per cycle - 1.984598314 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.009263e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.579301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.579301e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.609519 sec + 5,634,626,051 cycles # 3.496 GHz + 14,869,650,453 instructions # 2.64 insn per cycle + 1.612769577 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193803280592 Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.494763 sec - 4,316,279,907 cycles # 2.878 GHz - 9,028,948,283 instructions # 2.09 insn per cycle - 1.500523975 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.363912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029612e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029612e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.228350 sec + 4,099,462,044 cycles # 3.331 GHz + 9,010,087,189 instructions # 2.20 insn per cycle + 1.231663795 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.509333 sec - 4,207,142,397 cycles # 2.778 GHz - 8,663,183,236 instructions # 2.06 insn per cycle - 1.515104262 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.733550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.074728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.074728e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.188251 sec + 3,967,821,126 cycles # 3.332 GHz + 8,612,196,683 instructions # 2.17 insn per cycle + 1.191524344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.886655 sec - 3,832,564,290 cycles # 2.026 GHz - 7,807,000,610 instructions # 2.04 insn per cycle - 1.892395760 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.076849e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204629e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204629e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.084935 sec + 3,621,365,276 cycles # 3.330 GHz + 7,753,075,474 instructions # 2.14 insn per cycle + 1.088240742 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183246739209 Relative difference = 1.6003107281264138e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index dda1db1b3c..61d1db8a51 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_03:05:16 +DATE: 2024-03-01_19:24:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478497 sec - 2,092,584,267 cycles # 2.987 GHz - 2,982,481,806 instructions # 1.43 insn per cycle - 0.759974164 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.962914 sec - 11,745,545,496 cycles # 2.960 GHz - 35,108,793,810 instructions # 2.99 insn per cycle - 3.968579892 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.548926e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.686522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.686522e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.071996 sec + 10,748,097,201 cycles # 3.497 GHz + 35,134,305,151 instructions # 3.27 insn per cycle + 3.075247886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.925244 sec - 5,962,598,726 cycles # 3.089 GHz - 14,469,931,867 instructions # 2.43 insn per cycle - 1.931094914 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.177278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.785505e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.785505e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 1.574710 sec + 5,513,686,982 cycles # 3.496 GHz + 14,479,421,661 instructions # 2.63 insn per cycle + 1.578018917 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193583255634 Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.475701 sec - 4,155,772,808 cycles # 2.809 GHz - 8,874,967,057 instructions # 2.14 insn per cycle - 1.481449825 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.002551e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111042e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111042e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.154884 sec + 3,857,962,551 cycles # 3.333 GHz + 8,855,937,803 instructions # 2.30 insn per cycle + 1.158168839 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.405788 sec - 4,123,527,517 cycles # 2.923 GHz - 8,411,119,259 instructions # 2.04 insn per cycle - 1.411551419 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.549625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.050633e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.050633e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.209387 sec + 4,037,720,483 cycles # 3.332 GHz + 8,359,219,935 instructions # 2.07 insn per cycle + 1.212702560 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.851731 sec - 3,787,634,254 cycles # 2.040 GHz - 7,699,934,932 instructions # 2.03 insn per cycle - 1.857323010 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.088722e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.221112e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.221112e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.073794 sec + 3,584,683,392 cycles # 3.331 GHz + 7,646,262,060 instructions # 2.13 insn per cycle + 1.077090879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183204829693 Relative difference = 1.5796536184903122e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 9748a5aab4..4c26b84629 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:28:51 +DATE: 2024-03-01_19:11:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526886 sec - 2,307,341,508 cycles # 3.024 GHz - 3,271,429,537 instructions # 1.42 insn per cycle - 0.836809323 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.926720 sec - 15,303,062,403 cycles # 3.103 GHz - 38,574,821,235 instructions # 2.52 insn per cycle - 4.935986004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.726645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.798326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.798326e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.974088 sec + 13,901,240,398 cycles # 3.496 GHz + 38,519,700,643 instructions # 2.77 insn per cycle + 3.977444803 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.903163 sec - 8,984,859,488 cycles # 3.089 GHz - 24,224,163,348 instructions # 2.70 insn per cycle - 2.918366508 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.888376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.137236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.137236e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.271352 sec + 7,950,613,495 cycles # 3.496 GHz + 24,219,096,177 instructions # 3.05 insn per cycle + 2.274776261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.860423 sec - 5,396,289,064 cycles # 2.891 GHz - 11,276,510,611 instructions # 2.09 insn per cycle - 1.875091896 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.937592e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.600519e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.600519e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.442766 sec + 4,812,419,685 cycles # 3.329 GHz + 11,255,351,127 instructions # 2.34 insn per cycle + 1.446223272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.648151 sec - 4,836,682,110 cycles # 2.924 GHz - 10,524,586,299 instructions # 2.18 insn per cycle - 1.662467551 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.502777e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.260895e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260895e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.356803 sec + 4,529,795,443 cycles # 3.332 GHz + 10,470,222,537 instructions # 2.31 insn per cycle + 1.360332208 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.587933 sec - 5,228,382,592 cycles # 2.016 GHz - 7,603,380,674 instructions # 1.45 insn per cycle - 2.604403134 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.442635e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.183690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.183690e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.366107 sec + 4,556,171,739 cycles # 3.329 GHz + 7,545,350,102 instructions # 1.66 insn per cycle + 1.369560048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 4c3bdeb3a7..ff51ce9edf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,206 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_02:29:18 +DATE: 2024-03-01_19:11:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529654 sec - 2,293,467,091 cycles # 2.992 GHz - 3,241,408,242 instructions # 1.41 insn per cycle - 0.836485234 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe 
/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.994421 sec - 15,338,753,655 cycles # 3.068 GHz - 40,369,233,372 instructions # 2.63 insn per cycle - 5.002383718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.696678e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.767700e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.767700e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.016604 sec + 14,052,871,002 cycles # 3.497 GHz + 40,347,826,232 instructions # 2.87 insn per cycle + 4.019950872 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.723159 sec - 8,478,435,163 cycles # 3.107 GHz - 23,253,497,249 instructions # 2.74 insn per cycle - 2.738604338 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.079311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.348964e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.348964e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.189998 sec + 7,660,292,335 cycles # 3.494 GHz + 23,249,177,880 instructions # 3.04 insn per cycle + 2.193387914 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.127824 sec - 6,241,547,842 cycles # 2.925 GHz - 12,962,413,577 instructions # 2.08 insn per cycle - 2.144515260 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.802190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.284263e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.284263e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.662935 sec + 5,537,885,797 cycles # 3.326 GHz + 12,941,082,077 instructions # 2.34 insn per cycle + 1.666384267 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.074458 sec - 5,923,278,346 cycles # 2.853 GHz - 12,242,730,346 instructions # 2.07 insn per cycle - 2.086429072 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.133486e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.661662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.661662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.593841 sec + 5,310,193,730 cycles # 3.327 GHz + 12,185,581,580 instructions # 2.29 insn per cycle + 1.597248619 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.794263 sec - 5,618,790,292 cycles # 2.007 GHz - 8,743,459,975 instructions # 1.56 insn per cycle - 2.808786612 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.179196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.881876e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.881876e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.404473 sec + 4,673,986,503 cycles # 3.322 GHz + 8,684,795,820 instructions # 1.86 insn per cycle + 1.407876338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index c4c4bff630..d6ad8dae6d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:29:46 +DATE: 2024-03-01_19:11:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463329 sec - 2,069,832,304 cycles # 3.002 GHz - 2,918,096,235 instructions # 1.41 insn per cycle - 0.772559551 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608947 sec - 2,562,374,732 cycles # 3.012 GHz - 3,879,371,783 instructions # 1.51 insn per cycle - 0.910123971 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.359535 sec - 19,687,428,773 cycles # 3.094 GHz - 59,604,296,849 instructions # 3.03 insn per cycle - 6.365859123 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.210168e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.225481e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.225481e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.129663 sec + 17,942,023,952 cycles # 3.497 GHz + 59,485,012,848 instructions # 3.32 insn per cycle + 5.132002080 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.515479 sec - 10,373,655,779 cycles # 2.948 GHz - 30,676,465,519 instructions # 2.96 insn per cycle - 3.528584808 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.227836e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.283842e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.283842e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.655716 sec + 9,284,099,234 cycles # 3.494 GHz + 30,662,650,809 instructions # 3.30 insn per cycle + 2.658111464 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.702212 sec - 4,885,421,396 cycles # 2.863 GHz - 11,020,224,832 instructions # 2.26 insn per cycle - 1.717667988 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.296898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319348e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.319348e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.287545 sec + 4,259,821,564 cycles # 3.305 GHz + 11,003,859,405 instructions # 2.58 insn per cycle + 1.289855021 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.517268 sec - 4,368,757,303 cycles # 2.872 GHz - 10,296,904,442 instructions # 2.36 insn per cycle - 1.532957385 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.392823e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.418968e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.418968e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.201286 sec + 3,975,596,259 cycles # 3.305 GHz + 10,273,857,845 instructions # 2.58 insn per cycle + 1.203631834 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.135983 sec - 4,101,318,849 cycles # 1.917 GHz - 5,843,401,136 instructions # 1.42 insn per cycle - 2.151041040 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.911241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961026e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961026e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.882693 sec + 2,920,749,975 cycles # 3.303 GHz + 5,817,985,024 instructions # 1.99 insn per cycle + 0.885121704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 7a80a6327c..105cbe8fdc 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_03:14:59 +DATE: 2024-03-01_19:30:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.494713 sec - 2,059,588,733 cycles # 2.926 GHz - 3,067,379,574 instructions # 1.49 insn per cycle - 0.764554853 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.824199 sec - 3,179,114,916 cycles # 2.965 GHz - 5,069,610,946 instructions # 1.59 insn per cycle - 1.133521853 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.518056 sec - 19,750,480,394 cycles # 3.028 GHz - 59,611,727,500 instructions # 3.02 insn per cycle - 6.522447301 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.215926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.231294e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.231294e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.124663 sec + 17,917,725,694 cycles # 3.495 GHz + 59,493,244,220 instructions # 3.32 insn per cycle + 5.127126814 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.370584 sec - 10,396,817,898 cycles # 3.081 GHz - 30,723,473,589 instructions # 2.96 insn per cycle - 3.375008450 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.222963e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.278776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.278776e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.663023 sec + 9,311,853,268 cycles # 3.495 GHz + 30,712,911,054 instructions # 3.30 insn per cycle + 2.665569418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.685691 sec - 4,902,930,827 cycles # 2.902 GHz - 11,066,989,869 instructions # 2.26 insn per cycle - 1.690115997 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.294133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.316415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.316415e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.295128 sec + 4,286,401,261 cycles # 3.305 GHz + 11,055,645,863 instructions # 2.58 insn per cycle + 1.297609303 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.513774 sec - 4,402,683,305 cycles # 2.901 GHz - 10,346,890,880 instructions # 2.35 insn per cycle - 1.518250177 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.391259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.417196e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.417196e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.206857 sec + 3,995,174,773 cycles # 3.306 GHz + 10,322,661,719 instructions # 2.58 insn per cycle + 1.209297943 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.132010 sec - 4,131,468,761 cycles # 1.935 GHz - 5,881,941,509 instructions # 1.42 insn per cycle - 2.136586909 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.907041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.955766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.955766e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.888605 sec + 2,942,234,200 cycles # 3.305 GHz + 5,854,009,429 instructions # 1.99 insn per cycle + 0.891034303 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 90bf6e6455..6001d7f706 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:30:15 +DATE: 2024-03-01_19:11:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.465265 sec - 2,029,896,808 cycles # 2.980 GHz - 2,854,741,238 instructions # 1.41 insn per cycle - 0.763772288 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.607194 sec - 2,545,937,909 cycles # 2.996 GHz - 3,826,405,631 instructions # 1.50 insn per cycle - 0.909330494 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.317260 sec - 19,445,883,412 cycles # 3.076 GHz - 58,795,735,881 instructions # 3.02 insn per cycle - 6.323702590 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.239919e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.255416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.255416e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.082609 sec + 17,776,981,365 cycles # 3.497 GHz + 58,797,467,293 instructions # 3.31 insn per cycle + 5.084873713 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.363533 sec - 10,256,448,579 cycles # 3.046 GHz - 30,347,165,405 instructions # 2.96 insn per cycle - 3.377280590 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.233270e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.289870e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.289870e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.653124 sec + 9,279,704,528 cycles # 3.496 GHz + 30,333,640,947 instructions # 3.27 insn per cycle + 2.655395297 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.728674 sec - 5,043,692,461 cycles # 2.911 GHz - 11,484,727,811 instructions # 2.28 insn per cycle - 1.738921569 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.249037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.269785e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269785e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.336298 sec + 4,419,712,116 cycles # 3.304 GHz + 11,468,590,102 instructions # 2.59 insn per cycle + 1.338622545 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.607009 sec - 4,642,681,786 cycles # 2.882 GHz - 10,842,961,046 instructions # 2.34 insn per cycle - 1.618440779 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.321449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.345024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.345024e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.264700 sec + 4,183,889,364 cycles # 3.304 GHz + 10,818,738,539 instructions # 2.59 insn per cycle + 1.266987637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.134046 sec - 4,109,311,958 cycles # 1.922 GHz - 6,106,472,133 instructions # 1.49 insn per cycle - 2.145705149 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.893042e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.941104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.941104e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.890295 sec + 2,945,907,544 cycles # 3.303 GHz + 6,079,729,396 instructions # 2.06 insn per cycle + 0.892645568 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index af4f474b65..b334186e41 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:30:44 +DATE: 2024-03-01_19:12:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.445727 sec - 2,001,558,197 cycles # 3.000 GHz - 2,820,746,449 instructions # 1.41 insn per cycle - 0.736568143 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.500107 sec - 2,158,124,631 cycles # 2.977 GHz - 3,092,829,809 instructions # 1.43 insn per cycle - 0.784432881 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.146873 sec - 19,061,096,774 cycles # 3.099 GHz - 58,958,014,215 instructions # 3.09 insn per cycle - 6.153306662 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.311464e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.328435e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.328435e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 4.972112 sec + 17,386,645,192 cycles # 3.496 GHz + 58,906,569,186 instructions # 3.39 insn per cycle + 4.974344840 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.886682 sec - 5,850,782,122 cycles # 3.096 GHz - 16,695,269,066 instructions # 2.85 insn per cycle - 1.898716135 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.100537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.119548e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119548e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 +TOTAL : 1.511800 sec + 5,288,183,416 cycles # 3.495 GHz + 16,687,010,204 instructions # 3.16 insn per cycle + 1.514049543 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.886334 sec - 2,581,461,055 cycles # 2.900 GHz - 5,980,838,355 instructions # 2.32 insn per cycle - 0.901108038 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.462224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.557798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.557798e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.688318 sec + 2,281,074,327 cycles # 3.307 GHz + 5,967,081,727 instructions # 2.62 insn per cycle + 0.690552036 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.825324 sec - 2,349,134,788 cycles # 2.832 GHz - 5,603,128,082 instructions # 2.39 insn per cycle - 0.837493797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.669958e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.775682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.775682e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.637008 sec + 2,111,600,396 cycles # 3.307 GHz + 5,581,688,810 instructions # 2.64 insn per cycle + 0.639312316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.138775 sec - 2,054,810,359 cycles # 1.798 GHz - 3,334,038,485 instructions # 1.62 insn per cycle - 1.149410848 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.932137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.167092e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167092e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 +TOTAL : 0.440183 sec + 1,458,007,499 cycles # 3.301 GHz + 3,311,390,347 instructions # 2.27 insn per cycle + 0.442462707 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index f62f4c8cdf..dc7aef3b25 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_03:15:29 +DATE: 2024-03-01_19:30:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.451281 sec - 1,977,131,537 cycles # 2.986 GHz - 2,910,150,577 instructions # 1.47 insn per cycle - 0.718929629 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.637857 sec - 2,608,085,808 cycles # 2.999 GHz - 3,961,129,191 instructions # 1.52 insn per cycle - 0.928114705 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.166590 sec - 19,068,958,964 cycles # 3.091 GHz - 58,962,429,433 instructions # 3.09 insn per cycle - 6.170849448 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.313699e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330593e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330593e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 4.970690 sec + 17,398,259,542 cycles # 3.499 GHz + 58,910,916,539 instructions # 3.39 insn per cycle + 4.972903941 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.898339 sec - 5,876,062,473 cycles # 3.090 GHz - 16,741,995,731 instructions # 2.85 insn per cycle - 1.902713080 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.100372e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.119384e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.119384e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 +TOTAL : 1.515113 sec + 5,305,885,790 cycles # 3.498 GHz + 16,735,297,320 instructions # 3.15 insn per cycle + 1.517362548 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.895765 sec - 2,600,620,319 cycles # 2.891 GHz - 6,016,590,564 instructions # 2.31 insn per cycle - 0.900189489 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.343773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.429942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.429942e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.724732 sec + 2,402,094,625 cycles # 3.307 GHz + 6,003,946,640 instructions # 2.50 insn per cycle + 0.727046765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.810420 sec - 2,363,958,510 cycles # 2.904 GHz - 5,639,045,986 instructions # 2.39 insn per cycle - 0.814799834 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.660234e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.764784e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.764784e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.642041 sec + 2,128,904,938 cycles # 3.308 GHz + 5,618,859,132 instructions # 2.64 insn per cycle + 0.644359266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.048212 sec - 2,071,251,869 cycles # 1.970 GHz - 3,374,799,702 instructions # 1.63 insn per cycle - 1.052574627 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.928773e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164302e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164302e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 +TOTAL : 0.443516 sec + 1,473,489,767 cycles # 3.311 GHz + 3,352,410,215 instructions # 2.28 insn per cycle + 0.445839527 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index b43a9401e8..15e9b2f227 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:31:09 +DATE: 2024-03-01_19:12:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.446885 sec - 1,972,174,797 cycles # 2.962 GHz - 2,746,314,290 instructions # 1.39 insn per cycle - 0.738224654 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.497273 sec - 2,176,246,033 cycles # 3.004 GHz - 3,133,180,341 instructions # 1.44 insn per cycle - 0.782102946 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.143350 sec - 18,995,848,931 cycles # 3.090 GHz - 58,700,265,502 instructions # 3.09 insn per cycle - 6.150073952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.323721e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.340795e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340795e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 4.954042 sec + 17,317,128,762 cycles # 3.495 GHz + 58,675,021,220 instructions # 3.39 insn per cycle + 4.956261990 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1024) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.804269 sec - 5,584,642,506 cycles # 3.088 GHz - 16,510,962,038 instructions # 2.96 insn per cycle - 1.819572816 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.165888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187242e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 +TOTAL : 1.428429 sec + 4,993,014,379 cycles # 3.492 GHz + 16,503,883,029 instructions # 3.31 insn per cycle + 1.430662597 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.022630 sec - 2,975,513,176 cycles # 2.898 GHz - 6,634,498,276 instructions # 2.23 insn per cycle - 1.034400565 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.140223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.212640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212640e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.788410 sec + 2,610,891,292 cycles # 3.306 GHz + 6,621,195,509 instructions # 2.54 insn per cycle + 0.790692475 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.945795 sec - 2,752,522,160 cycles # 2.898 GHz - 6,256,039,450 instructions # 2.27 insn per cycle - 0.961442115 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.287783e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.364522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.364522e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 +TOTAL : 0.739428 sec + 2,449,362,633 cycles # 3.306 GHz + 6,234,567,338 instructions # 2.55 insn per cycle + 0.741705444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.200320 sec - 2,230,572,619 cycles # 1.852 GHz - 3,698,329,997 instructions # 1.66 insn per cycle - 1.213663484 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.476688e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.662734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.662734e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 +TOTAL : 0.494659 sec + 1,638,048,070 cycles # 3.302 GHz + 3,675,411,384 instructions # 2.24 insn per cycle + 0.496969912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 568d6c4513..d6556bb1be 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:31:34 +DATE: 2024-03-01_19:12:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463709 sec - 2,071,639,040 cycles # 3.004 GHz - 2,941,031,538 instructions # 1.42 insn per cycle - 0.764842159 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608855 sec - 2,552,084,280 cycles # 3.004 GHz - 3,794,047,088 instructions # 1.49 insn per cycle - 0.909216297 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.456566 sec - 20,000,355,725 cycles # 3.096 GHz - 60,532,425,335 instructions # 3.03 insn per cycle - 6.462989015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.149728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.164397e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.164397e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.227650 sec + 18,278,883,667 cycles # 3.496 GHz + 60,529,183,661 instructions # 3.31 insn per cycle + 5.230024763 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1404) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.288178 sec - 10,191,043,016 cycles # 3.096 GHz - 30,384,591,666 instructions # 2.98 insn per cycle - 3.302408299 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.312678e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.370148e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.370148e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.620058 sec + 9,163,068,711 cycles # 3.495 GHz + 30,372,553,579 instructions # 3.31 insn per cycle + 2.622366147 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.686926 sec - 4,874,678,301 cycles # 2.883 GHz - 10,979,160,826 instructions # 2.25 insn per cycle - 1.698730583 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.306653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.329616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329616e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.278379 sec + 4,229,813,296 cycles # 3.305 GHz + 10,963,305,485 instructions # 2.59 insn per cycle + 1.280804887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.469271 sec - 4,278,421,569 cycles # 2.904 GHz - 10,248,685,624 instructions # 2.40 insn per cycle - 1.480280367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.415509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.442464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.442464e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.182291 sec + 3,912,618,342 cycles # 3.305 GHz + 10,225,163,393 instructions # 2.61 insn per cycle + 1.184557374 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4279) (512y: 82) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.183850 sec - 4,204,822,902 cycles # 1.923 GHz - 6,044,506,630 instructions # 1.44 insn per cycle - 2.192719745 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.830365e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.874980e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.874980e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.920395 sec + 3,040,685,755 cycles # 3.298 GHz + 6,018,479,747 instructions # 1.98 insn per cycle + 0.922695458 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 2001d2a062..29df44b487 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_02:32:03 +DATE: 2024-03-01_19:12:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.461655 sec - 2,079,301,655 cycles # 3.013 GHz - 2,945,288,445 instructions # 1.42 insn per cycle - 0.761228896 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603998 sec - 2,550,056,991 cycles # 3.016 GHz - 3,770,712,997 instructions # 1.48 insn per cycle - 0.905342631 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.482109 sec - 19,897,203,281 cycles # 3.068 GHz - 59,934,079,759 instructions # 3.01 insn per cycle - 6.488470935 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.185651e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.200819e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.200819e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.168749 sec + 18,081,178,947 cycles # 3.497 GHz + 59,877,727,481 instructions # 3.31 insn per cycle + 5.171016307 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1262) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.246582 sec - 10,068,513,741 cycles # 3.097 GHz - 30,097,905,174 instructions # 2.99 insn per cycle - 3.264343936 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.319458e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.377020e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.377020e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.617395 sec + 9,156,723,314 cycles # 3.496 GHz + 30,085,782,756 instructions # 3.29 insn per cycle + 2.619742420 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.728964 sec - 5,016,079,762 cycles # 2.895 GHz - 11,483,054,886 instructions # 2.29 insn per cycle - 1.742427809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.247849e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.268698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.268698e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.337352 sec + 4,424,282,737 cycles # 3.304 GHz + 11,463,474,174 instructions # 2.59 insn per cycle + 1.339740203 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4717) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.580395 sec - 4,590,869,899 cycles # 2.898 GHz - 10,811,034,467 instructions # 2.35 insn per cycle - 1.596114627 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.341509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.365682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.365682e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.246200 sec + 4,123,785,215 cycles # 3.305 GHz + 10,787,122,633 instructions # 2.62 insn per cycle + 1.248520950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.184061 sec - 4,216,157,602 cycles # 1.927 GHz - 6,273,944,868 instructions # 1.49 insn per cycle - 2.195028764 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.838145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883823e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883823e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.916566 sec + 3,029,597,137 cycles # 3.300 GHz + 6,246,692,057 instructions # 2.06 insn per cycle + 0.918942029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index c4f627d4b9..a3ce2c07d0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:32:32 +DATE: 2024-03-01_19:13:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491439e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526891 sec - 2,312,216,646 cycles # 3.007 GHz - 3,538,385,257 instructions # 1.53 insn per cycle - 0.841955777 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.037875 sec - 10,086,152,870 cycles # 3.059 GHz - 22,511,661,776 instructions # 2.23 insn per cycle - 3.352868148 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.365178 sec - 25,629,682,297 cycles # 3.063 GHz - 78,935,463,104 instructions # 3.08 insn per cycle - 8.371779038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.481783e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483017e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.483017e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.616014 sec + 23,134,926,646 cycles # 3.496 GHz + 78,769,826,090 instructions # 3.40 insn per cycle + 6.618301143 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.352554 sec - 12,920,825,541 cycles # 2.966 GHz - 39,280,019,197 instructions # 3.04 insn per cycle - 4.370436126 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.886296e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.890965e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.890965e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.363857 sec + 11,665,465,951 cycles # 3.466 GHz + 39,273,332,129 instructions # 3.37 insn per cycle + 3.366166984 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.920439 sec - 5,577,220,412 cycles # 2.899 GHz - 13,686,699,383 instructions # 2.45 insn per cycle - 1.933532640 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.126568e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.128873e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.128873e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.462899 sec + 4,833,071,970 cycles # 3.300 GHz + 13,680,344,924 instructions # 2.83 insn per cycle + 1.465169066 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.708010 sec - 4,898,677,790 cycles # 2.863 GHz - 12,341,670,637 instructions # 2.52 insn per cycle - 1.722166284 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.280240e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.283273e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.283273e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.288270 sec + 4,256,880,431 cycles # 3.300 GHz + 12,334,305,617 instructions # 2.90 insn per cycle + 1.290601255 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.187284 sec - 4,109,191,778 cycles # 1.875 GHz - 6,335,550,253 instructions # 1.54 insn per cycle - 2.200752564 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.023870e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.031547e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031547e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.817740 sec + 2,700,603,798 cycles # 3.296 GHz + 6,324,447,395 instructions # 2.34 insn per cycle + 0.820100253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 8d1778e673..1d56a4470b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:16:28 +DATE: 2024-03-01_19:30:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.511155 sec - 2,228,194,908 cycles # 3.016 GHz - 3,541,287,827 instructions # 1.59 insn per cycle - 0.799045956 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.305480 sec - 10,998,775,521 cycles # 3.077 GHz - 24,493,841,360 instructions # 2.23 insn per cycle - 3.633710964 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.395628 sec - 25,661,453,890 cycles # 3.059 GHz - 78,946,626,848 instructions # 3.08 insn per cycle - 8.400144517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.481958e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483200e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.483200e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.617892 sec + 23,143,239,754 cycles # 3.496 GHz + 78,775,014,566 instructions # 3.40 insn per cycle + 6.620381527 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.352704 sec - 12,939,532,043 cycles # 2.970 GHz - 39,292,271,047 instructions # 3.04 insn per cycle - 4.357352756 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.816064e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.820578e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.820578e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.415562 sec + 11,674,359,157 cycles # 3.416 GHz + 39,286,928,254 instructions # 3.37 insn per cycle + 3.418065939 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.929060 sec - 5,589,750,479 cycles # 2.892 GHz - 13,696,577,373 instructions # 2.45 insn per cycle - 1.933630865 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.129360e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131693e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131693e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.461769 sec + 4,829,397,024 cycles # 3.300 GHz + 13,689,355,371 instructions # 2.83 insn per cycle + 1.464289743 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.695619 sec - 4,910,055,408 cycles # 2.889 GHz - 12,351,492,799 instructions # 2.52 insn per cycle - 1.700097015 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.278771e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281807e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281807e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.292785 sec + 4,271,370,436 cycles # 3.300 GHz + 12,345,376,003 instructions # 2.89 insn per cycle + 1.295310523 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.165843 sec - 4,123,850,554 cycles # 1.901 GHz - 6,345,407,560 instructions # 1.54 insn per cycle - 2.170297070 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.021965e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.029524e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.029524e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.821349 sec + 2,711,016,004 cycles # 3.294 GHz + 6,335,916,291 instructions # 2.34 insn per cycle + 0.823888580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 597fd5665a..dc8d1365ef 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:28:08 +DATE: 2024-03-01_19:35:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.505991 sec - 2,242,092,583 cycles # 3.014 GHz - 3,466,791,908 instructions # 1.55 insn per cycle - 0.811853126 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.124130 sec - 10,356,034,147 cycles # 3.069 GHz - 23,417,816,833 instructions # 2.26 insn per cycle - 3.433693053 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.481844e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483078e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.483078e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.389537 sec - 25,646,805,438 cycles # 3.056 GHz - 78,935,262,340 instructions # 3.08 insn per cycle - 8.393631651 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.615618 sec + 23,132,699,978 cycles # 3.496 GHz + 78,769,037,054 instructions # 3.41 insn per cycle + 6.617829059 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.815456e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.819978e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.819978e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.369422 sec - 12,916,153,129 cycles # 2.954 GHz - 39,278,867,860 instructions # 3.04 insn per cycle - 4.373667823 seconds time elapsed +TOTAL : 3.413181 sec + 11,662,452,031 cycles # 3.415 GHz + 39,273,293,393 instructions # 3.37 insn per cycle + 3.415398215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.130382e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132719e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132719e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.933878 sec - 5,580,678,683 cycles # 2.881 GHz - 13,684,529,284 instructions # 2.45 insn per cycle - 1.937965494 seconds time elapsed +TOTAL : 1.458091 sec + 4,816,775,693 cycles # 3.300 GHz + 13,680,360,383 instructions # 2.84 insn per cycle + 1.460366647 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.278483e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281512e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281512e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.697628 sec - 4,903,453,092 cycles # 2.882 GHz - 12,338,806,795 instructions # 2.52 insn per cycle - 1.701856837 seconds time elapsed +TOTAL : 1.290121 sec + 4,262,527,735 cycles # 3.300 GHz + 12,334,714,107 instructions # 2.89 insn per cycle + 1.292329143 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.025121e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.032750e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.032750e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.253262 sec - 4,111,107,725 cycles # 1.822 GHz - 6,332,329,650 instructions # 1.54 insn per cycle - 2.257544828 seconds time elapsed +TOTAL : 0.817158 sec + 2,696,851,480 cycles # 3.294 GHz + 6,324,641,682 instructions # 2.35 insn per cycle + 0.819432272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 60e01cd2dd..93293d216b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,220 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:24:50 +DATE: 2024-03-01_19:34:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.510827e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.539312e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.541615e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.503655 sec - 2,239,000,994 cycles # 3.024 GHz - 3,553,306,239 instructions # 1.59 insn per cycle - 0.813367897 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145153e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179407e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.180837e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.061657 sec - 10,188,245,124 cycles # 3.074 GHz - 23,248,414,020 instructions # 2.28 insn per cycle - 3.370951944 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.938252e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.939166e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.939166e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.471636 sec - 25,650,928,170 cycles # 3.027 GHz - 78,935,761,644 instructions # 3.08 insn per cycle - 8.475777896 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted + 4,606,961 cycles # 3.257 GHz + 6,302,943 instructions # 1.37 insn per cycle + 0.037961845 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.732481e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.735838e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.735838e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.403548 sec - 12,924,361,173 cycles # 2.933 GHz - 39,279,334,894 instructions # 3.04 insn per cycle - 4.407811208 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted + 4,632,503 cycles # 2.696 GHz + 6,332,916 instructions # 1.37 insn per cycle + 0.038722425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.485088e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.502714e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.502714e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.942219 sec - 5,571,920,631 cycles # 2.864 GHz - 13,685,480,241 instructions # 2.46 insn per cycle - 1.946449782 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted + 4,502,042 cycles # 2.684 GHz + 6,323,077 instructions # 1.40 insn per cycle + 0.038401230 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.737761e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.761950e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.761950e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.693500 sec - 4,894,918,115 cycles # 2.884 GHz - 12,340,665,409 instructions # 2.52 insn per cycle - 1.697702233 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe: Aborted + 4,898,492 cycles # 3.254 GHz + 6,342,668 instructions # 1.29 insn per cycle + 0.037656802 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.532631e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.547658e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.547658e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.186925 sec - 4,105,530,431 cycles # 1.874 GHz - 6,333,977,995 instructions # 1.54 insn per cycle - 2.191453097 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe: Aborted + 4,865,885 cycles # 3.245 GHz + 6,331,116 instructions # 1.30 insn per cycle + 0.038706032 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index de32359ede..f7105fde21 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,223 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:21:36 +DATE: 2024-03-01_19:33:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.508517 sec - 2,246,531,629 cycles # 3.011 GHz - 3,559,465,442 instructions # 1.58 insn per cycle - 0.806328345 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.195111 sec - 10,565,694,760 cycles # 3.061 GHz - 24,272,327,456 instructions # 2.30 insn per cycle - 3.508790742 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.415718 sec - 25,630,796,247 cycles # 3.044 GHz - 78,935,144,677 instructions # 3.08 insn per cycle - 8.419920398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.481824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.483072e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.483072e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.615817 sec + 23,135,149,155 cycles # 3.496 GHz + 78,769,523,152 instructions # 3.40 insn per cycle + 6.618024105 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.383944 sec - 12,941,364,841 cycles # 2.950 GHz - 39,279,009,350 instructions # 3.04 insn per cycle - 4.388336169 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.822215e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.826748e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.826748e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.408335 sec + 11,661,748,107 cycles # 3.420 GHz + 39,273,222,931 instructions # 3.37 insn per cycle + 3.410571683 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.951803 sec - 5,576,482,664 cycles # 2.852 GHz - 13,685,505,947 instructions # 2.45 insn per cycle - 1.956019187 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.127496e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.129817e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.129817e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.461766 sec + 4,828,963,686 cycles # 3.300 GHz + 13,679,994,736 instructions # 2.83 insn per cycle + 1.464026212 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.690955 sec - 4,892,330,509 cycles # 2.888 GHz - 12,340,572,549 instructions # 2.52 insn per cycle - 1.695111197 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.278160e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.281173e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.281173e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.290276 sec + 4,260,419,952 cycles # 3.298 GHz + 12,334,650,697 instructions # 2.90 insn per cycle + 1.292701283 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.155567 sec - 4,105,793,778 cycles # 1.902 GHz - 6,333,858,387 instructions # 1.54 insn per cycle - 2.159935327 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.023476e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.031069e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.031069e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.817764 sec + 2,700,756,236 cycles # 3.297 GHz + 6,324,382,140 instructions # 2.34 insn per cycle + 0.819998960 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 836b2fd223..cad62f799d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:33:08 +DATE: 2024-03-01_19:13:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.492178e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523446 sec - 2,259,779,898 cycles # 2.994 GHz - 3,514,783,609 instructions # 1.56 insn per cycle - 0.830655921 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.027147 sec - 10,102,095,677 cycles # 3.066 GHz - 22,774,733,235 instructions # 2.25 insn per cycle - 3.352533111 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.342362 sec - 25,562,894,530 cycles # 3.064 GHz - 78,707,498,900 instructions # 3.08 insn per cycle - 8.350709191 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.503316e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.504578e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.504578e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.559386 sec + 22,937,995,811 cycles # 3.496 GHz + 78,503,071,109 instructions # 3.42 insn per cycle + 6.561632048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4246) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.374701 sec - 12,919,245,066 cycles # 2.951 GHz - 39,226,355,054 instructions # 3.04 insn per cycle - 4.387657418 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.721965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.726341e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.726341e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.480511 sec + 11,746,156,846 cycles # 3.373 GHz + 39,219,964,596 instructions # 3.34 insn per cycle + 3.482689146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.987975 sec - 5,629,143,308 cycles # 2.825 GHz - 13,800,788,871 instructions # 2.45 insn per cycle - 1.999251955 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.134275e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136627e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136627e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.452965 sec + 4,799,994,769 cycles # 3.300 GHz + 13,794,367,131 instructions # 2.87 insn per cycle + 1.455293281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.716692 sec - 4,942,228,477 cycles # 2.873 GHz - 12,466,581,724 instructions # 2.52 insn per cycle - 1.728222884 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.263188e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.266183e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.266183e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.305376 sec + 4,313,294,929 cycles # 3.300 GHz + 12,459,449,195 instructions # 2.89 insn per cycle + 1.307767380 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.159145 sec - 4,117,977,410 cycles # 1.904 GHz - 6,458,802,297 instructions # 1.57 insn per cycle - 2.172057894 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.017201e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024979e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.024979e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.820329 sec + 2,710,241,576 cycles # 3.297 GHz + 6,448,202,008 instructions # 2.38 insn per cycle + 0.822690390 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 5cb26f1dc5..70731b668b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:05:40 +DATE: 2024-03-01_19:24:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.264818e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.529504 sec - 2,311,611,520 cycles # 3.006 GHz - 3,548,053,349 instructions # 1.53 insn per cycle - 0.826491750 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.298192 sec - 10,832,117,508 cycles # 3.051 GHz - 23,123,371,744 instructions # 2.13 insn per cycle - 3.609870208 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.966049 sec - 113,615,073,618 cycles # 3.074 GHz - 144,968,095,911 instructions # 1.28 insn per cycle - 36.970400514 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.040051e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.040564e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.040564e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 32.548023 sec + 113,830,166,208 cycles # 3.497 GHz + 143,883,071,750 instructions # 1.26 insn per cycle + 32.550633058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21053) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.007790 sec - 14,730,075,423 cycles # 2.939 GHz - 37,574,123,368 instructions # 2.55 insn per cycle - 5.012256986 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.954257e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.957301e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.957301e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.155080 sec + 14,533,662,034 cycles # 3.496 GHz + 37,569,403,247 instructions # 2.58 insn per cycle + 4.157544496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.127650 sec - 6,163,100,705 cycles # 2.892 GHz - 13,061,449,928 instructions # 2.12 insn per cycle - 2.132187716 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.045261e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.060223e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.060223e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.820401 sec + 6,012,541,840 cycles # 3.300 GHz + 13,057,135,841 instructions # 2.17 insn per cycle + 1.822679574 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.743142 sec - 5,059,957,423 cycles # 2.897 GHz - 11,440,000,239 instructions # 2.26 insn per cycle - 1.747501406 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.099476e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.101705e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101705e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.499045 sec + 4,952,169,341 cycles # 3.300 GHz + 11,434,777,878 instructions # 2.31 insn per cycle + 1.501339023 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.075865 sec - 3,979,244,183 cycles # 1.914 GHz - 5,942,139,795 instructions # 1.49 insn per cycle - 2.080305520 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.995108e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.002501e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.002501e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.829410 sec + 2,740,060,865 cycles # 3.297 GHz + 5,933,015,109 instructions # 2.17 insn per cycle + 0.831788683 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index afca4b7953..37ef5ca493 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:06:48 +DATE: 2024-03-01_19:25:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.275983e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.531287 sec - 2,311,991,159 cycles # 3.015 GHz - 3,584,221,599 instructions # 1.55 insn per cycle - 0.825938734 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.269849 sec - 10,805,743,512 cycles # 3.068 GHz - 25,084,175,459 instructions # 2.32 insn per cycle - 3.579404730 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.253529 sec - 114,121,742,420 cycles # 3.069 GHz - 145,689,073,244 instructions # 1.28 insn per cycle - 37.257693750 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.995413e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.995921e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.995921e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 32.838703 sec + 114,844,431,122 cycles # 3.497 GHz + 144,446,965,492 instructions # 1.26 insn per cycle + 32.841012912 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22369) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.136766 sec - 15,152,451,249 cycles # 2.948 GHz - 37,761,291,325 instructions # 2.49 insn per cycle - 5.141156615 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.843711e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.846578e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.846578e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.274359 sec + 14,949,342,714 cycles # 3.496 GHz + 37,756,081,643 instructions # 2.53 insn per cycle + 4.276658422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.072422 sec - 6,013,210,013 cycles # 2.896 GHz - 12,895,807,400 instructions # 2.14 insn per cycle - 2.076740513 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.243220e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.258795e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.258795e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.781599 sec + 5,884,681,951 cycles # 3.300 GHz + 12,892,638,092 instructions # 2.19 insn per cycle + 1.783976623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.755119 sec - 5,091,337,522 cycles # 2.895 GHz - 11,446,622,503 instructions # 2.25 insn per cycle - 1.759562583 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.089187e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.091372e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091372e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.513045 sec + 4,998,315,786 cycles # 3.300 GHz + 11,441,056,031 instructions # 2.29 insn per cycle + 1.515430361 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.059473 sec - 3,944,538,203 cycles # 1.912 GHz - 5,896,184,476 instructions # 1.49 insn per cycle - 2.063940696 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.009625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017155e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.823716 sec + 2,720,763,680 cycles # 3.298 GHz + 5,887,165,290 instructions # 2.16 insn per cycle + 0.826091686 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 082176c355..d61f6be5ab 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:33:45 +DATE: 2024-03-01_19:13:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401451e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.481440 sec - 2,077,514,231 cycles # 2.979 GHz - 3,093,505,744 instructions # 1.49 insn per cycle - 0.777796663 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.622317e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.713365 sec - 5,944,272,538 cycles # 3.053 GHz - 12,632,277,461 instructions # 2.13 insn per cycle - 2.004079656 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.010109 sec - 24,614,432,061 cycles # 3.072 GHz - 78,126,558,251 instructions # 3.17 insn per cycle - 8.016891762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.541023e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.542272e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.542272e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.472852 sec + 22,594,942,433 cycles # 3.496 GHz + 78,109,438,979 instructions # 3.46 insn per cycle + 6.475034460 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.228676 sec - 6,461,822,382 cycles # 2.894 GHz - 20,120,855,558 instructions # 3.11 insn per cycle - 2.241648353 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.868488e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.888089e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.888089e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 1.680412 sec + 5,721,227,554 cycles # 3.426 GHz + 20,115,812,778 instructions # 3.52 insn per cycle + 1.682695212 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.990019 sec - 2,821,251,649 cycles # 2.839 GHz - 6,989,221,748 instructions # 2.48 insn per cycle - 1.002444816 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.195033e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204366e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.753674 sec + 2,492,228,979 cycles # 3.300 GHz + 6,983,768,704 instructions # 2.80 insn per cycle + 0.755942549 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.861179 sec - 2,488,986,957 cycles # 2.876 GHz - 6,296,476,670 instructions # 2.53 insn per cycle - 0.887481911 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.521876e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.534107e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534107e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.656748 sec + 2,172,255,103 cycles # 3.300 GHz + 6,290,065,451 instructions # 2.90 insn per cycle + 0.658959491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.078476 sec - 2,048,809,794 cycles # 1.894 GHz - 3,266,667,713 instructions # 1.59 insn per cycle - 1.091634951 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.029368e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061175e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061175e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.413448 sec + 1,368,272,372 cycles # 3.298 GHz + 3,258,833,410 instructions # 2.38 insn per cycle + 0.415626098 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 6f564b583c..4bdc113ec5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:17:05 +DATE: 2024-03-01_19:31:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.468201 sec - 2,060,292,715 cycles # 2.983 GHz - 3,094,906,819 instructions # 1.50 insn per cycle - 0.750075013 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.882218 sec - 6,478,461,444 cycles # 3.059 GHz - 12,879,929,349 instructions # 1.99 insn per cycle - 2.174649918 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.044775 sec - 24,623,818,516 cycles # 3.060 GHz - 78,132,484,739 instructions # 3.17 insn per cycle - 8.049291657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.540494e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.541741e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.541741e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.463976 sec + 22,603,420,847 cycles # 3.496 GHz + 78,113,326,401 instructions # 3.46 insn per cycle + 6.466307048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.197009 sec - 6,464,288,620 cycles # 2.938 GHz - 20,129,426,624 instructions # 3.11 insn per cycle - 2.201352169 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.972987e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.992778e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.992778e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 1.652838 sec + 5,725,272,561 cycles # 3.461 GHz + 20,124,950,762 instructions # 3.52 insn per cycle + 1.655209946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.973161 sec - 2,827,392,405 cycles # 2.894 GHz - 6,998,075,079 instructions # 2.48 insn per cycle - 0.977561277 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.251048e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.260877e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.260877e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.736738 sec + 2,436,577,597 cycles # 3.300 GHz + 6,993,293,212 instructions # 2.87 insn per cycle + 0.739071334 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.859317 sec - 2,491,742,914 cycles # 2.887 GHz - 6,305,390,293 instructions # 2.53 insn per cycle - 0.863665296 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.520306e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532523e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532523e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.658909 sec + 2,179,756,886 cycles # 3.300 GHz + 6,299,575,077 instructions # 2.89 insn per cycle + 0.661241811 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.067932 sec - 2,057,227,059 cycles # 1.920 GHz - 3,276,345,738 instructions # 1.59 insn per cycle - 1.072312021 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.103354e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.136311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.136311e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.408039 sec + 1,350,732,763 cycles # 3.299 GHz + 3,269,590,389 instructions # 2.42 insn per cycle + 0.410460189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 66226e8d59..d77de8d4f6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:28:45 +DATE: 2024-03-01_19:36:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.363626e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.461299 sec - 2,006,885,691 cycles # 2.992 GHz - 3,022,532,155 instructions # 1.51 insn per cycle - 0.728549346 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.795584 sec - 6,148,728,410 cycles # 3.042 GHz - 12,326,233,623 instructions # 2.00 insn per cycle - 2.078967785 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.539698e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.540976e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.540976e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 7.994149 sec - 24,620,138,866 cycles # 3.079 GHz - 78,125,377,108 instructions # 3.17 insn per cycle - 7.998228624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.464734 sec + 22,602,962,065 cycles # 3.496 GHz + 78,108,873,476 instructions # 3.46 insn per cycle + 6.466917782 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.925627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.945236e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.945236e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.242069 sec - 6,461,640,731 cycles # 2.878 GHz - 20,121,052,869 instructions # 3.11 insn per cycle - 2.246196034 seconds time elapsed +TOTAL : 1.658772 sec + 5,725,250,336 cycles # 3.449 GHz + 20,115,356,658 instructions # 3.51 insn per cycle + 1.660966677 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.251477e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.261264e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.261264e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.982986 sec - 2,822,415,829 cycles # 2.862 GHz - 6,987,486,660 instructions # 2.48 insn per cycle - 0.987025186 seconds time elapsed +TOTAL : 0.734787 sec + 2,429,802,465 cycles # 3.300 GHz + 6,983,714,880 instructions # 2.87 insn per cycle + 0.736976159 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.521461e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.533742e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.533742e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.855808 sec - 2,484,894,865 cycles # 2.892 GHz - 6,291,816,709 instructions # 2.53 insn per cycle - 0.859867773 seconds time elapsed +TOTAL : 0.656838 sec + 2,172,470,830 cycles # 3.300 GHz + 6,289,863,881 instructions # 2.90 insn per cycle + 0.658982044 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.110621e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.143899e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.143899e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.069890 sec - 2,051,026,977 cycles # 1.912 GHz - 3,263,937,559 instructions # 1.59 insn per cycle - 1.073863100 seconds time elapsed +TOTAL : 0.405468 sec + 1,341,887,518 cycles # 3.298 GHz + 3,258,430,365 instructions # 2.43 insn per cycle + 0.407620788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index e810053300..06ac489be0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,220 +1,138 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:25:26 +DATE: 2024-03-01_19:34:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.337764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388253e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.393743e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.460965 sec - 2,014,485,763 cycles # 3.003 GHz - 3,009,625,577 instructions # 1.49 insn per cycle - 0.728425666 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.558734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.632343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.635567e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.743753 sec - 6,041,672,737 cycles # 3.067 GHz - 12,221,124,809 instructions # 2.02 insn per cycle - 2.027112098 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.040104e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.041097e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.041097e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.046597 sec - 24,613,022,395 cycles # 3.060 GHz - 78,130,326,722 instructions # 3.17 insn per cycle - 8.050808561 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted + 4,588,745 cycles # 3.254 GHz + 6,297,711 instructions # 1.37 insn per cycle + 0.038493069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.468090e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.482424e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.482424e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.203809 sec - 6,456,229,713 cycles # 2.925 GHz - 20,119,923,968 instructions # 3.12 insn per cycle - 2.207913022 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted + 4,649,873 cycles # 3.255 GHz + 6,330,030 instructions # 1.36 insn per cycle + 0.038267941 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.712278e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.719631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.719631e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.965705 sec - 2,817,996,939 cycles # 2.908 GHz - 6,988,025,639 instructions # 2.48 insn per cycle - 0.969794950 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted + 4,641,444 cycles # 3.257 GHz + 6,326,220 instructions # 1.36 insn per cycle + 0.037581414 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.924856e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934354e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934354e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.859873 sec - 2,483,822,785 cycles # 2.877 GHz - 6,295,526,273 instructions # 2.53 insn per cycle - 0.863979329 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe: Aborted + 4,553,782 cycles # 3.259 GHz + 6,327,532 instructions # 1.39 insn per cycle + 0.038327048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.558368e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.558368e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.064425 sec - 2,047,040,960 cycles # 1.917 GHz - 3,265,583,381 instructions # 1.60 insn per cycle - 1.068371519 seconds time elapsed +/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe: Aborted + 4,589,693 cycles # 3.232 GHz + 6,342,822 instructions # 1.38 insn per cycle + 0.039622291 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 29def3747b..597c41c4c4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,223 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:22:13 +DATE: 2024-03-01_19:33:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.463988 sec - 2,009,660,419 cycles # 2.987 GHz - 3,043,780,102 instructions # 1.51 insn per cycle - 0.732052318 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.829361 sec - 6,179,090,687 cycles # 3.005 GHz - 13,497,023,724 instructions # 2.18 insn per cycle - 2.119489112 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.072340 sec - 24,646,233,583 cycles # 3.055 GHz - 78,130,465,005 instructions # 3.17 insn per cycle - 8.076398723 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.541255e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.542497e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.542497e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.460363 sec + 22,594,860,702 cycles # 3.497 GHz + 78,108,705,130 instructions # 3.46 insn per cycle + 6.462497444 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.213064 sec - 6,463,144,308 cycles # 2.916 GHz - 20,121,040,605 instructions # 3.11 insn per cycle - 2.217197026 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.980894e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000089e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000089e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 1.649650 sec + 5,715,455,769 cycles # 3.462 GHz + 20,116,970,711 instructions # 3.52 insn per cycle + 1.651831111 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.977816 sec - 2,816,932,981 cycles # 2.871 GHz - 6,987,870,279 instructions # 2.48 insn per cycle - 0.981891147 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.224266e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.233876e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.233876e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.743696 sec + 2,459,110,588 cycles # 3.300 GHz + 6,984,047,266 instructions # 2.84 insn per cycle + 0.745892046 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.859893 sec - 2,483,713,955 cycles # 2.877 GHz - 6,295,351,555 instructions # 2.53 insn per cycle - 0.863911879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.517139e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.529391e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.529391e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.657921 sec + 2,175,611,006 cycles # 3.300 GHz + 6,290,403,371 instructions # 2.89 insn per cycle + 0.660110597 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.064299 sec - 2,046,605,748 cycles # 1.917 GHz - 3,265,707,472 instructions # 1.60 insn per cycle - 1.068273671 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.107456e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.140872e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.140872e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.405676 sec + 1,342,942,249 cycles # 3.298 GHz + 3,258,636,901 instructions # 2.43 insn per cycle + 0.407877226 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 50b444080d..2ad237dd1a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:34:14 +DATE: 2024-03-01_19:13:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.380502e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.482038 sec - 2,083,496,491 cycles # 2.987 GHz - 3,090,021,729 instructions # 1.48 insn per cycle - 0.780369869 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.505248e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.719742 sec - 5,952,430,615 cycles # 3.047 GHz - 11,750,571,480 instructions # 1.97 insn per cycle - 2.009992190 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.050624 sec - 24,577,706,132 cycles # 3.054 GHz - 77,857,469,800 instructions # 3.17 insn per cycle - 8.057072902 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.559405e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.560665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.560665e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.414841 sec + 22,433,654,429 cycles # 3.497 GHz + 77,753,050,419 instructions # 3.47 insn per cycle + 6.417041697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3125) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866268634797E-004 -Relative difference = 5.630135835748959e-08 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627486e-04 +Avg ME (F77/C++) = 6.6274863364631370E-004 +Relative difference = 5.076783822441729e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.274363 sec - 6,415,212,085 cycles # 2.816 GHz - 20,086,390,532 instructions # 3.13 insn per cycle - 2.288238797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.823157e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.842320e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.842320e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 +TOTAL : 1.676003 sec + 5,656,803,299 cycles # 3.372 GHz + 20,080,771,686 instructions # 3.55 insn per cycle + 1.678252022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861465384638E-004 Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.010969 sec - 2,918,129,602 cycles # 2.878 GHz - 7,130,827,098 instructions # 2.44 insn per cycle - 1.024648825 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.186032e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.197298e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197298e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.756528 sec + 2,501,574,254 cycles # 3.300 GHz + 7,125,249,610 instructions # 2.85 insn per cycle + 0.758780713 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.895519 sec - 2,583,274,132 cycles # 2.873 GHz - 6,439,451,842 instructions # 2.49 insn per cycle - 0.910176239 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.413858e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425042e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425042e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 +TOTAL : 0.685713 sec + 2,267,843,391 cycles # 3.300 GHz + 6,433,455,899 instructions # 2.84 insn per cycle + 0.687939686 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.109477 sec - 2,120,739,457 cycles # 1.905 GHz - 3,428,489,642 instructions # 1.62 insn per cycle - 1.120804955 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.907924e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.937842e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937842e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.426031 sec + 1,409,552,540 cycles # 3.297 GHz + 3,420,395,875 instructions # 2.43 insn per cycle + 0.428204579 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952032322112E-004 Relative difference = 3.066639970473621e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 3e610d68fd..146b66385f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:07:56 +DATE: 2024-03-01_19:26:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.599390e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487762 sec - 2,117,397,644 cycles # 2.979 GHz - 3,170,491,357 instructions # 1.50 insn per cycle - 0.771619877 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.728616e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.852993 sec - 6,403,206,858 cycles # 3.066 GHz - 13,984,822,985 instructions # 2.18 insn per cycle - 2.145838793 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.541681 sec - 87,683,123,741 cycles # 3.072 GHz - 135,626,627,328 instructions # 1.55 insn per cycle - 28.545959109 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.142872e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.143601e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.143601e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 +TOTAL : 26.704464 sec + 93,392,949,985 cycles # 3.497 GHz + 135,135,739,845 instructions # 1.45 insn per cycle + 26.706712729 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15558) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340277317796E-004 -Relative difference = 4.184328521943034e-09 +Avg ME (F77/C++) = 6.6275340237237357E-004 +Relative difference = 3.579572077573998e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.302428 sec - 6,776,067,855 cycles # 2.939 GHz - 19,386,467,667 instructions # 2.86 insn per cycle - 2.306810458 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.661989e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.676894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.676894e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.900144 sec + 6,560,267,417 cycles # 3.450 GHz + 19,382,075,169 instructions # 2.95 insn per cycle + 1.902414328 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862707273868E-004 Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.096393 sec - 3,175,310,502 cycles # 2.890 GHz - 6,807,675,147 instructions # 2.14 insn per cycle - 1.100557110 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.751056e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.757067e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.757067e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 0.943073 sec + 3,117,335,618 cycles # 3.300 GHz + 6,803,703,724 instructions # 2.18 insn per cycle + 0.945302019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.911313 sec - 2,641,911,907 cycles # 2.888 GHz - 5,985,989,672 instructions # 2.27 insn per cycle - 0.915610697 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.112187e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.120841e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.120841e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 0.782995 sec + 2,589,201,691 cycles # 3.300 GHz + 5,981,113,539 instructions # 2.31 insn per cycle + 0.785295392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.084772 sec - 2,074,111,548 cycles # 1.906 GHz - 3,500,542,355 instructions # 1.69 insn per cycle - 1.089027435 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.297217e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318515e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318515e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 +TOTAL : 0.503966 sec + 1,665,820,543 cycles # 3.296 GHz + 3,493,880,654 instructions # 2.10 insn per cycle + 0.506230823 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750363879224E-004 Relative difference = 5.490631193034436e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index f668536073..dd21065806 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_03:08:48 +DATE: 2024-03-01_19:26:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.593399e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.485011 sec - 2,123,544,393 cycles # 3.007 GHz - 3,219,525,664 instructions # 1.52 insn per cycle - 0.766064420 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.637487e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.858325 sec - 6,401,876,626 cycles # 3.056 GHz - 13,834,352,039 instructions # 2.16 insn per cycle - 2.151127842 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.469746 sec - 87,566,965,728 cycles # 3.076 GHz - 135,909,521,186 instructions # 1.55 insn per cycle - 28.473960910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 6.387508e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388300e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.388300e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 +TOTAL : 25.683491 sec + 89,826,358,356 cycles # 3.497 GHz + 135,208,960,358 instructions # 1.51 insn per cycle + 25.685677569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275352674967369E-004 -Relative difference = 4.0361421941458736e-08 +Avg ME (F77/C++) = 6.6275352716470975E-004 +Relative difference = 4.098765184605283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.304055 sec - 6,854,008,563 cycles # 2.972 GHz - 19,438,508,034 instructions # 2.84 insn per cycle - 2.308246423 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.437796e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.452057e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.452057e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.950080 sec + 6,654,162,703 cycles # 3.410 GHz + 19,433,960,536 instructions # 2.92 insn per cycle + 1.952366253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862764021530E-004 Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.070827 sec - 3,111,432,280 cycles # 2.896 GHz - 6,718,585,544 instructions # 2.16 insn per cycle - 1.075017514 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.786843e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.793023e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.793023e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 0.924113 sec + 3,054,845,902 cycles # 3.300 GHz + 6,714,445,014 instructions # 2.20 insn per cycle + 0.926394819 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.900474 sec - 2,630,752,588 cycles # 2.910 GHz - 5,969,340,561 instructions # 2.27 insn per cycle - 0.904647261 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.132201e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.140940e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.140940e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 0.775532 sec + 2,564,183,121 cycles # 3.300 GHz + 5,963,951,119 instructions # 2.33 insn per cycle + 0.777744409 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.083027 sec - 2,083,719,160 cycles # 1.918 GHz - 3,494,111,175 instructions # 1.68 insn per cycle - 1.087325959 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.326443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.348069e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.348069e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 +TOTAL : 0.499522 sec + 1,651,245,664 cycles # 3.296 GHz + 3,487,405,377 instructions # 2.11 insn per cycle + 0.501723648 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750384530066E-004 Relative difference = 5.80223501432476e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 8553820a52..f9ab256fce 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:34:44 +DATE: 2024-03-01_19:14:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504525e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522907 sec - 2,248,416,129 cycles # 2.981 GHz - 3,483,881,112 instructions # 1.55 insn per cycle - 0.829467781 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.035491 sec - 10,039,386,860 cycles # 3.052 GHz - 22,522,898,713 instructions # 2.24 insn per cycle - 3.349083086 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.409354 sec - 25,927,870,734 cycles # 3.082 GHz - 79,436,480,305 instructions # 3.06 insn per cycle - 8.416137774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.460441e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.461666e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.461666e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.673146 sec + 23,335,676,255 cycles # 3.496 GHz + 79,277,751,932 instructions # 3.40 insn per cycle + 6.675453883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4801) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.395641 sec - 12,641,926,900 cycles # 2.873 GHz - 38,549,360,435 instructions # 3.05 insn per cycle - 4.411574958 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.686446e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.690711e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.690711e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.506769 sec + 11,577,360,778 cycles # 3.300 GHz + 38,543,055,165 instructions # 3.33 insn per cycle + 3.509025941 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.889905 sec - 5,503,418,397 cycles # 2.905 GHz - 13,481,227,468 instructions # 2.45 insn per cycle - 1.901949052 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.172980e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175482e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175482e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.405389 sec + 4,642,988,163 cycles # 3.300 GHz + 13,474,033,843 instructions # 2.90 insn per cycle + 1.407688497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.679659 sec - 4,858,057,374 cycles # 2.885 GHz - 12,135,455,571 instructions # 2.50 insn per cycle - 1.694768152 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.284064e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.287100e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.287100e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.284493 sec + 4,244,001,962 cycles # 3.300 GHz + 12,130,560,495 instructions # 2.86 insn per cycle + 1.286759541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.297248 sec - 4,143,595,621 cycles # 1.801 GHz - 6,336,694,490 instructions # 1.53 insn per cycle - 2.312628428 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.007453e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.014936e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.014936e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.824364 sec + 2,723,570,767 cycles # 3.298 GHz + 6,306,786,489 instructions # 2.32 insn per cycle + 0.826694116 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 44d560fb63..0e2a0168d0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_02:35:21 +DATE: 2024-03-01_19:14:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.505143e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.522485 sec - 2,266,664,443 cycles # 3.011 GHz - 3,552,942,464 instructions # 1.57 insn per cycle - 0.824080628 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.023944 sec - 10,029,910,184 cycles # 3.059 GHz - 21,497,951,661 instructions # 2.14 insn per cycle - 3.338904131 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266732376103494E-004 -Relative difference = 2.659538381540814e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.531114 sec - 25,939,606,781 cycles # 3.040 GHz - 79,447,311,630 instructions # 3.06 insn per cycle - 8.537643841 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.465973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.467206e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.467206e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 6.658248 sec + 23,291,569,184 cycles # 3.498 GHz + 79,237,899,390 instructions # 3.40 insn per cycle + 6.660539391 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4465) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.372440 sec - 12,693,692,693 cycles # 2.901 GHz - 38,521,475,204 instructions # 3.03 insn per cycle - 4.385193423 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.697632e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.701963e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.701963e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.498479 sec + 11,550,565,484 cycles # 3.300 GHz + 38,513,812,102 instructions # 3.33 insn per cycle + 3.500917728 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.908191 sec - 5,531,901,200 cycles # 2.893 GHz - 13,605,961,475 instructions # 2.46 insn per cycle - 1.920337987 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.152534e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154954e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.154954e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.430198 sec + 4,725,017,333 cycles # 3.300 GHz + 13,599,869,443 instructions # 2.88 insn per cycle + 1.432524334 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.699452 sec - 4,910,284,170 cycles # 2.883 GHz - 12,271,024,564 instructions # 2.50 insn per cycle - 1.712563313 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.271343e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.274342e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274342e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.297199 sec + 4,286,083,032 cycles # 3.300 GHz + 12,263,686,047 instructions # 2.86 insn per cycle + 1.299546744 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.177959 sec - 4,164,411,217 cycles # 1.910 GHz - 6,442,301,345 instructions # 1.55 insn per cycle - 2.190574077 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.010301e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.017802e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.017802e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.823170 sec + 2,718,854,468 cycles # 3.297 GHz + 6,413,166,633 instructions # 2.36 insn per cycle + 0.825525653 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 93119c7539..3e8c727bbe 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:37:42 +DATE: 2024-03-01_19:15:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.065940e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.448496 sec - 8,082,390,398 cycles # 2.946 GHz - 16,852,562,382 instructions # 2.09 insn per cycle - 2.848455369 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.002127 sec - 13,348,526,839 cycles # 3.088 GHz - 31,140,905,358 instructions # 2.33 insn per cycle - 4.382097820 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.566168 sec - 18,831,689,747 cycles # 2.868 GHz - 53,916,332,004 instructions # 2.86 insn per cycle - 6.572689464 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.007917e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.007943e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007943e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.242575 sec + 18,324,214,896 cycles # 3.494 GHz + 53,656,205,350 instructions # 2.93 insn per cycle + 5.244757907 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.182674 sec - 9,806,871,766 cycles # 3.081 GHz - 27,093,022,297 instructions # 2.76 insn per cycle - 3.192772007 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.907110e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.907199e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.907199e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.771308 sec + 9,690,965,867 cycles # 3.495 GHz + 27,085,067,925 instructions # 2.79 insn per cycle + 2.773516868 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.462430 sec - 4,231,767,010 cycles # 2.892 GHz - 9,562,001,834 instructions # 2.26 insn per cycle - 1.472832936 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.253444e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.253870e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.253870e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.243280 sec + 4,107,456,413 cycles # 3.300 GHz + 9,554,437,400 instructions # 2.33 insn per cycle + 1.245464164 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.282131 sec - 3,734,243,960 cycles # 2.905 GHz - 8,486,594,514 instructions # 2.27 insn per cycle - 1.294140643 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.901436e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.902024e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.902024e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.079126 sec + 3,565,937,704 cycles # 3.300 GHz + 8,478,433,637 instructions # 2.38 insn per cycle + 1.081341215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.432645 sec - 2,701,519,987 cycles # 1.882 GHz - 4,274,080,381 instructions # 1.58 insn per cycle - 1.444722496 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.570541e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.572308e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.572308e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.618837 sec + 2,038,410,234 cycles # 3.287 GHz + 4,264,215,045 instructions # 2.09 insn per cycle + 0.620988332 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 7163808f45..898ee4858d 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_03:17:34 +DATE: 2024-03-01_19:31:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.373786 sec - 8,212,794,649 cycles # 3.050 GHz - 17,373,508,782 instructions # 2.12 insn per cycle - 2.749788140 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.992060 sec - 13,207,906,873 cycles # 3.062 GHz - 30,525,969,027 instructions # 2.31 insn per cycle - 4.371813741 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.484661 sec - 18,737,465,302 cycles # 2.888 GHz - 53,915,906,594 instructions # 2.88 insn per cycle - 6.488680620 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.008044e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.008070e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.008070e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.240399 sec + 18,320,301,067 cycles # 3.495 GHz + 53,657,376,785 instructions # 2.93 insn per cycle + 5.242561648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.177972 sec - 9,794,551,146 cycles # 3.079 GHz - 27,093,049,280 instructions # 2.77 insn per cycle - 3.182112356 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.907377e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.907467e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.907467e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.770516 sec + 9,697,495,185 cycles # 3.498 GHz + 27,086,113,073 instructions # 2.79 insn per cycle + 2.772701601 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.495047 sec - 4,300,282,840 cycles # 2.870 GHz - 9,561,701,370 instructions # 2.22 insn per cycle - 1.499121189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.249825e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.250273e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.250273e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.244792 sec + 4,112,966,387 cycles # 3.300 GHz + 9,555,785,054 instructions # 2.32 insn per cycle + 1.246985323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.287264 sec - 3,730,461,014 cycles # 2.891 GHz - 8,485,603,542 instructions # 2.27 insn per cycle - 1.291227222 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.901276e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.901882e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.901882e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.079264 sec + 3,566,506,063 cycles # 3.300 GHz + 8,479,328,463 instructions # 2.38 insn per cycle + 1.081393203 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.415968 sec - 2,690,639,160 cycles # 1.896 GHz - 4,273,336,878 instructions # 1.59 insn per cycle - 1.420067464 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.552403e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.554214e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.554214e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.620126 sec + 2,047,589,081 cycles # 3.294 GHz + 4,264,912,626 instructions # 2.08 insn per cycle + 0.622279806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index fcaae9673e..6a223c1182 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:38:46 +DATE: 2024-03-01_19:15:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.067339e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.446944 sec - 8,408,759,874 cycles # 3.068 GHz - 18,673,492,162 instructions # 2.22 insn per cycle - 2.843675081 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.986190 sec - 13,309,313,958 cycles # 3.084 GHz - 29,253,936,467 instructions # 2.20 insn per cycle - 4.370982628 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.219195 sec - 18,809,079,145 cycles # 3.025 GHz - 53,925,834,666 instructions # 2.87 insn per cycle - 6.232860023 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.978714e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.978965e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.978965e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.293741 sec + 18,509,726,309 cycles # 3.496 GHz + 53,668,966,356 instructions # 2.90 insn per cycle + 5.295890020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32178) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.189478 sec - 9,805,870,159 cycles # 3.076 GHz - 27,091,831,447 instructions # 2.76 insn per cycle - 3.203897537 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.900218e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.900307e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.900307e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.781026 sec + 9,726,233,315 cycles # 3.496 GHz + 27,082,128,443 instructions # 2.78 insn per cycle + 2.783220311 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.464714 sec - 4,224,699,489 cycles # 2.882 GHz - 9,562,401,622 instructions # 2.26 insn per cycle - 1.476328883 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.269104e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.269539e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.269539e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.238661 sec + 4,092,373,675 cycles # 3.300 GHz + 9,554,221,469 instructions # 2.33 insn per cycle + 1.240854013 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.294499 sec - 3,723,740,700 cycles # 2.874 GHz - 8,486,051,495 instructions # 2.28 insn per cycle - 1.308410916 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.901596e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.902173e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.902173e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.079028 sec + 3,565,293,300 cycles # 3.300 GHz + 8,478,411,533 instructions # 2.38 insn per cycle + 1.081182403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.421818 sec - 2,699,411,216 cycles # 1.899 GHz - 4,277,531,970 instructions # 1.58 insn per cycle - 1.435104148 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.548194e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.549944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.549944e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.620448 sec + 2,042,307,043 cycles # 3.284 GHz + 4,267,534,527 instructions # 2.09 insn per cycle + 0.622543731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e89ab34326..4969158cd6 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:39:49 +DATE: 2024-03-01_19:16:21 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.769342e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.706494 sec - 5,724,877,835 cycles # 2.946 GHz - 11,350,286,337 instructions # 1.98 insn per cycle - 2.064496697 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.926202 sec - 6,794,636,243 cycles # 3.076 GHz - 13,931,883,029 instructions # 2.05 insn per cycle - 2.265774235 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.899633 sec - 18,012,008,843 cycles # 3.055 GHz - 53,588,806,253 instructions # 2.98 insn per cycle - 5.906269981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.040188e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.040217e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 5.079204 sec + 17,758,838,927 cycles # 3.495 GHz + 53,421,541,162 instructions # 3.01 insn per cycle + 5.081388085 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087545108E-003 +Relative difference = 2.11977393295785e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.492504 sec - 4,596,969,768 cycles # 3.077 GHz - 13,763,413,131 instructions # 2.99 insn per cycle - 1.508036951 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.216869e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.217343e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.217343e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 +TOTAL : 1.254841 sec + 4,386,128,209 cycles # 3.492 GHz + 13,755,443,722 instructions # 3.14 insn per cycle + 1.257011070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.749250 sec - 2,146,538,234 cycles # 2.864 GHz - 4,817,770,938 instructions # 2.24 insn per cycle - 0.763621351 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.433176e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.435080e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.435080e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.628179 sec + 2,077,257,351 cycles # 3.300 GHz + 4,810,210,256 instructions # 2.32 insn per cycle + 0.630303522 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.652928 sec - 1,865,233,671 cycles # 2.849 GHz - 4,274,819,205 instructions # 2.29 insn per cycle - 0.666710238 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.775402e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.777862e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.777862e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.542106 sec + 1,793,648,403 cycles # 3.300 GHz + 4,267,307,526 instructions # 2.38 insn per cycle + 0.544249464 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.715424 sec - 1,360,172,621 cycles # 1.900 GHz - 2,159,744,323 instructions # 1.59 insn per cycle - 0.729957103 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.719714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.720497e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.720497e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 +TOTAL : 0.309831 sec + 1,024,905,660 cycles # 3.295 GHz + 2,150,921,588 instructions # 2.10 insn per cycle + 0.312019958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 684ca24c1f..1862b6e1e8 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_03:18:37 +DATE: 2024-03-01_19:31:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.598425 sec - 5,724,594,753 cycles # 3.063 GHz - 12,186,790,592 instructions # 2.13 insn per cycle - 1.928350107 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.887489 sec - 6,620,617,732 cycles # 3.045 GHz - 14,303,245,528 instructions # 2.16 insn per cycle - 2.231962749 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.812831 sec - 17,931,583,834 cycles # 3.083 GHz - 53,588,775,363 instructions # 2.99 insn per cycle - 5.816760256 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.041119e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041148e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.041148e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 5.074721 sec + 17,746,301,498 cycles # 3.497 GHz + 53,422,248,318 instructions # 3.01 insn per cycle + 5.076826933 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087545108E-003 +Relative difference = 2.11977393295785e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.483014 sec - 4,585,157,051 cycles # 3.085 GHz - 13,762,636,955 instructions # 3.00 insn per cycle - 1.487033664 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.218426e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.218902e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.218902e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 +TOTAL : 1.254452 sec + 4,388,322,961 cycles # 3.494 GHz + 13,756,441,763 instructions # 3.13 insn per cycle + 1.256579192 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.734407 sec - 2,124,324,714 cycles # 2.880 GHz - 4,817,114,861 instructions # 2.27 insn per cycle - 0.738469635 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.453965e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.455916e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.455916e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.626630 sec + 2,072,617,839 cycles # 3.300 GHz + 4,811,160,652 instructions # 2.32 insn per cycle + 0.628769204 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.686036 sec - 1,868,608,359 cycles # 2.710 GHz - 4,274,464,507 instructions # 2.29 insn per cycle - 0.690085324 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.777854e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.780436e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.780436e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.541994 sec + 1,793,603,060 cycles # 3.300 GHz + 4,268,281,709 instructions # 2.38 insn per cycle + 0.544156652 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.701778 sec - 1,356,865,477 cycles # 1.924 GHz - 2,159,196,207 instructions # 1.59 insn per cycle - 0.705773287 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.717939e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.718720e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.718720e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 +TOTAL : 0.310080 sec + 1,025,575,344 cycles # 3.292 GHz + 2,151,895,701 instructions # 2.10 insn per cycle + 0.312200294 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 2af18ad9d5..0d3fb1c86c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:40:36 +DATE: 2024-03-01_19:16:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.766757e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.693781 sec - 5,858,518,501 cycles # 3.029 GHz - 12,487,165,720 instructions # 2.13 insn per cycle - 2.044833380 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.933893 sec - 6,737,061,424 cycles # 3.047 GHz - 14,801,104,127 instructions # 2.20 insn per cycle - 2.267780802 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.925615 sec - 17,989,215,363 cycles # 3.036 GHz - 53,579,777,630 instructions # 2.98 insn per cycle - 5.931642569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.032406e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032434e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032434e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 5.116727 sec + 17,886,024,738 cycles # 3.495 GHz + 53,426,547,569 instructions # 2.99 insn per cycle + 5.118914313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20415) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087257751E-003 +Relative difference = 2.119482139617284e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.490731 sec - 4,558,556,123 cycles # 3.055 GHz - 13,757,084,226 instructions # 3.02 insn per cycle - 1.501811120 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.256894e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.257370e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.257370e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 +TOTAL : 1.243013 sec + 4,349,759,961 cycles # 3.495 GHz + 13,748,864,713 instructions # 3.16 insn per cycle + 1.245159087 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896225560E-003 Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.743943 sec - 2,139,817,263 cycles # 2.875 GHz - 4,819,936,629 instructions # 2.25 insn per cycle - 0.755587883 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.499112e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.501031e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.501031e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.623063 sec + 2,060,898,442 cycles # 3.300 GHz + 4,812,404,156 instructions # 2.34 insn per cycle + 0.625176375 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.647666 sec - 1,869,906,105 cycles # 2.875 GHz - 4,276,791,956 instructions # 2.29 insn per cycle - 0.664053491 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.724133e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.726548e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.726548e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.544853 sec + 1,802,924,271 cycles # 3.300 GHz + 4,269,301,578 instructions # 2.37 insn per cycle + 0.547035634 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.718650 sec - 1,366,457,842 cycles # 1.901 GHz - 2,166,062,692 instructions # 1.59 insn per cycle - 0.731356674 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.697678e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698411e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 +TOTAL : 0.313529 sec + 1,032,832,833 cycles # 3.280 GHz + 2,157,258,957 instructions # 2.09 insn per cycle + 0.315697429 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982955140E-003 Relative difference = 2.0044060904369713e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index c639834643..8c752511ca 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:41:23 +DATE: 2024-03-01_19:17:00 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.691928e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.198692 sec - 7,604,134,018 cycles # 3.054 GHz - 16,321,512,266 instructions # 2.15 insn per cycle - 2.594812497 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.397194 sec - 11,475,121,938 cycles # 3.084 GHz - 26,000,925,285 instructions # 2.27 insn per cycle - 3.777191130 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.578920 sec - 19,096,747,933 cycles # 2.903 GHz - 54,154,360,803 instructions # 2.84 insn per cycle - 6.585797711 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.888048e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.888295e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.888295e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.342426 sec + 18,680,140,983 cycles # 3.496 GHz + 53,894,914,129 instructions # 2.89 insn per cycle + 5.344498912 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32196) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.239396 sec - 9,369,032,238 cycles # 2.892 GHz - 26,160,172,444 instructions # 2.79 insn per cycle - 3.251135271 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.973916e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.974013e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.974013e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.677279 sec + 9,359,247,427 cycles # 3.494 GHz + 26,144,863,285 instructions # 2.79 insn per cycle + 2.679446649 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.438333 sec - 4,079,178,507 cycles # 2.840 GHz - 9,228,646,226 instructions # 2.26 insn per cycle - 1.450605350 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.494886e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.495366e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.495366e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.176795 sec + 3,888,164,225 cycles # 3.300 GHz + 9,214,766,456 instructions # 2.37 insn per cycle + 1.178930781 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.218747 sec - 3,509,445,956 cycles # 2.879 GHz - 8,176,263,750 instructions # 2.33 insn per cycle - 1.230057623 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.132578e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.133212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.133212e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.030734 sec + 3,406,124,705 cycles # 3.300 GHz + 8,162,197,379 instructions # 2.40 insn per cycle + 1.032894212 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.381042 sec - 2,620,845,167 cycles # 1.898 GHz - 4,155,618,865 instructions # 1.59 insn per cycle - 1.395419124 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.830953e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.832825e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.832825e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.600646 sec + 1,980,221,274 cycles # 3.289 GHz + 4,145,631,132 instructions # 2.09 insn per cycle + 0.602788892 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ace04f97d7..04dd6516f8 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_02:42:25 +DATE: 2024-03-01_19:17:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.692361e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171682 sec - 7,616,890,265 cycles # 3.058 GHz - 16,356,089,453 instructions # 2.15 insn per cycle - 2.553555988 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.406322 sec - 11,260,210,288 cycles # 3.017 GHz - 25,906,087,343 instructions # 2.30 insn per cycle - 3.788413520 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.644473 sec - 19,262,229,911 cycles # 2.898 GHz - 54,152,472,780 instructions # 2.81 insn per cycle - 6.648593616 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 9.833294e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.833543e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.833543e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.373246 sec + 18,782,693,778 cycles # 3.495 GHz + 53,895,950,177 instructions # 2.87 insn per cycle + 5.375414541 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32348) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.257928 sec - 9,349,757,536 cycles # 2.867 GHz - 26,077,919,393 instructions # 2.79 insn per cycle - 3.270643449 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.984004e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.984101e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.984101e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.663925 sec + 9,315,048,345 cycles # 3.495 GHz + 26,065,146,110 instructions # 2.80 insn per cycle + 2.666137308 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.408906 sec - 4,059,558,991 cycles # 2.874 GHz - 9,213,876,384 instructions # 2.27 insn per cycle - 1.420092908 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.435411e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.435873e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.435873e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.192345 sec + 3,939,476,607 cycles # 3.300 GHz + 9,201,069,306 instructions # 2.34 insn per cycle + 1.194498730 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.231479 sec - 3,558,951,872 cycles # 2.881 GHz - 8,168,148,330 instructions # 2.30 insn per cycle - 1.241837128 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.088546e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.089160e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.089160e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.039445 sec + 3,434,917,766 cycles # 3.300 GHz + 8,155,176,688 instructions # 2.37 insn per cycle + 1.041616665 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.381601 sec - 2,619,896,392 cycles # 1.892 GHz - 4,153,497,129 instructions # 1.59 insn per cycle - 1.390536918 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.811569e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.813385e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.813385e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.601983 sec + 1,984,653,054 cycles # 3.289 GHz + 4,144,401,294 instructions # 2.09 insn per cycle + 0.604132820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 4f705cbffa..2cec001807 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:35:57 +DATE: 2024-03-01_19:14:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446213 sec - 1,972,017,701 cycles # 2.992 GHz - 2,778,256,208 instructions # 1.41 insn per cycle - 0.734930275 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.528224 sec - 2,304,762,750 cycles # 3.008 GHz - 3,294,040,641 instructions # 1.43 insn per cycle - 0.823439197 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.522856 sec - 4,703,604,569 cycles # 3.081 GHz - 13,462,460,024 instructions # 2.86 insn per cycle - 1.529442917 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.390238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419308e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419308e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.204060 sec + 4,213,611,842 cycles # 3.495 GHz + 13,434,873,181 instructions # 3.19 insn per cycle + 1.206434287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.861454 sec - 2,622,516,081 cycles # 3.029 GHz - 7,553,226,055 instructions # 2.88 insn per cycle - 0.875162721 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.533543e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.629343e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.671073 sec + 2,349,549,529 cycles # 3.493 GHz + 7,542,124,614 instructions # 3.21 insn per cycle + 0.673394223 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.506903 sec - 1,479,878,074 cycles # 2.896 GHz - 3,120,545,502 instructions # 2.11 insn per cycle - 0.521612120 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.512664e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.794644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.794644e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.387185 sec + 1,289,012,082 cycles # 3.315 GHz + 3,105,638,031 instructions # 2.41 insn per cycle + 0.389505293 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.456990 sec - 1,342,026,946 cycles # 2.909 GHz - 2,982,806,139 instructions # 2.22 insn per cycle - 0.473253864 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.739873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.054260e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.054260e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.370616 sec + 1,234,624,013 cycles # 3.316 GHz + 2,961,729,590 instructions # 2.40 insn per cycle + 0.372993568 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.665523 sec - 1,326,336,546 cycles # 1.981 GHz - 1,954,248,677 instructions # 1.47 insn per cycle - 0.676015017 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.362372e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.769155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.769155e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.330642 sec + 1,102,117,372 cycles # 3.316 GHz + 1,932,728,355 instructions # 1.75 insn per cycle + 0.333111717 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 7838899130..bb3a843a97 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_03:15:54 +DATE: 2024-03-01_19:30:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.471075 sec - 2,051,009,542 cycles # 3.009 GHz - 3,055,349,974 instructions # 1.49 insn per cycle - 0.738770181 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.748132 sec - 3,046,262,026 cycles # 3.023 GHz - 4,636,082,832 instructions # 1.52 insn per cycle - 1.065675268 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.529900 sec - 4,728,814,715 cycles # 3.083 GHz - 13,467,526,764 instructions # 2.85 insn per cycle - 1.534252544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.391136e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.420435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.420435e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.207326 sec + 4,222,941,203 cycles # 3.492 GHz + 13,442,631,396 instructions # 3.18 insn per cycle + 1.209898020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.869004 sec - 2,652,875,861 cycles # 3.039 GHz - 7,602,145,003 instructions # 2.87 insn per cycle - 0.873736497 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.522907e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.617976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.617976e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.678477 sec + 2,374,544,352 cycles # 3.491 GHz + 7,592,390,633 instructions # 3.20 insn per cycle + 0.680996171 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.550316 sec - 1,514,222,662 cycles # 2.732 GHz - 3,170,467,422 instructions # 2.09 insn per cycle - 0.554802806 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.487640e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.768436e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.768436e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.394135 sec + 1,312,471,214 cycles # 3.316 GHz + 3,157,137,064 instructions # 2.41 insn per cycle + 0.396629875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.478096 sec - 1,374,122,120 cycles # 2.850 GHz - 3,032,631,270 instructions # 2.21 insn per cycle - 0.482825268 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.703184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.017690e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.017690e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.378206 sec + 1,260,283,982 cycles # 3.317 GHz + 3,011,528,674 instructions # 2.39 insn per cycle + 0.380880323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.675099 sec - 1,354,490,621 cycles # 1.996 GHz - 1,991,409,834 instructions # 1.47 insn per cycle - 0.679620955 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.332880e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.727962e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.727962e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.336607 sec + 1,122,764,946 cycles # 3.319 GHz + 1,968,890,958 instructions # 1.75 insn per cycle + 0.338989281 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 1de3a7df55..72f1443440 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:15 +DATE: 2024-03-01_19:14:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.443315 sec - 2,012,981,464 cycles # 3.013 GHz - 2,802,025,362 instructions # 1.39 insn per cycle - 0.744859677 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.526694 sec - 2,300,725,267 cycles # 3.007 GHz - 3,244,738,845 instructions # 1.41 insn per cycle - 0.822736768 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.520645 sec - 4,710,102,553 cycles # 3.090 GHz - 13,456,334,828 instructions # 2.86 insn per cycle - 1.527404362 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.382004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.410796e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410796e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.210873 sec + 4,235,605,729 cycles # 3.494 GHz + 13,439,420,817 instructions # 3.17 insn per cycle + 1.213116092 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 853) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.841713 sec - 2,618,818,041 cycles # 3.096 GHz - 7,552,217,415 instructions # 2.88 insn per cycle - 0.854217946 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.515723e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.608916e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.608916e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.675572 sec + 2,364,775,142 cycles # 3.493 GHz + 7,541,520,259 instructions # 3.19 insn per cycle + 0.677865882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.506766 sec - 1,482,977,233 cycles # 2.909 GHz - 3,119,381,568 instructions # 2.10 insn per cycle - 0.519705447 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.519149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.803981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.803981e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.386496 sec + 1,286,321,868 cycles # 3.315 GHz + 3,104,502,696 instructions # 2.41 insn per cycle + 0.388829317 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.457488 sec - 1,337,095,985 cycles # 2.896 GHz - 2,979,946,273 instructions # 2.23 insn per cycle - 0.473330982 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.751909e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.067933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.067933e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.369266 sec + 1,230,024,804 cycles # 3.316 GHz + 2,957,574,250 instructions # 2.40 insn per cycle + 0.371630014 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.666550 sec - 1,326,556,264 cycles # 1.978 GHz - 1,952,513,162 instructions # 1.47 insn per cycle - 0.681133765 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.342815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.741091e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.741091e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.331348 sec + 1,101,123,467 cycles # 3.308 GHz + 1,929,122,407 instructions # 1.75 insn per cycle + 0.333682550 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 4d40239a82..1c0f8553bf 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:34 +DATE: 2024-03-01_19:15:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439896 sec - 1,919,384,660 cycles # 2.928 GHz - 2,652,462,812 instructions # 1.38 insn per cycle - 0.728915663 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476459 sec - 2,111,535,021 cycles # 3.010 GHz - 2,984,192,787 instructions # 1.41 insn per cycle - 0.759063881 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.434431 sec - 4,452,862,887 cycles # 3.097 GHz - 13,047,773,125 instructions # 2.93 insn per cycle - 1.440725517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.488352e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522956e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 1.124842 sec + 3,936,915,161 cycles # 3.495 GHz + 13,032,328,324 instructions # 3.31 insn per cycle + 1.127095177 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.547840 sec - 1,698,684,785 cycles # 3.077 GHz - 4,513,142,797 instructions # 2.66 insn per cycle - 0.560862800 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.897446e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.147364e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.147364e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 +TOTAL : 0.443424 sec + 1,554,112,621 cycles # 3.493 GHz + 4,506,739,300 instructions # 2.90 insn per cycle + 0.445630602 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.289099 sec - 853,788,001 cycles # 2.912 GHz - 1,897,231,072 instructions # 2.22 insn per cycle - 0.300313484 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.891149e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.978192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.978192e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.230449 sec + 770,038,251 cycles # 3.320 GHz + 1,884,223,687 instructions # 2.45 insn per cycle + 0.232708341 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.271830 sec - 801,479,133 cycles # 2.904 GHz - 1,820,357,988 instructions # 2.27 insn per cycle - 0.285846070 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.376104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.503945e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.503945e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.218942 sec + 731,762,064 cycles # 3.320 GHz + 1,799,410,295 instructions # 2.46 insn per cycle + 0.221195439 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.349567 sec - 731,841,700 cycles # 2.069 GHz - 1,305,336,291 instructions # 1.78 insn per cycle - 0.359850888 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.006759e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.177946e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.177946e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.186117 sec + 620,244,286 cycles # 3.306 GHz + 1,284,140,386 instructions # 2.07 insn per cycle + 0.188364686 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 441da29ffb..b407e68a76 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,237 +1,188 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_03:16:12 +DATE: 2024-03-01_19:30:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.448633 sec - 2,014,530,108 cycles # 3.024 GHz - 2,953,646,670 instructions # 1.47 insn per cycle - 0.724573840 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.616658 sec - 2,563,348,424 cycles # 3.027 GHz - 3,871,269,369 instructions # 1.51 insn per cycle - 0.904047137 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.433803 sec - 4,469,694,345 cycles # 3.110 GHz - 13,052,094,019 instructions # 2.92 insn per cycle - 1.437926738 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.484109e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.518859e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.518859e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 1.130053 sec + 3,951,393,993 cycles # 3.492 GHz + 13,036,916,705 instructions # 3.30 insn per cycle + 1.132358928 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.554057 sec - 1,716,801,013 cycles # 3.079 GHz - 4,560,314,564 instructions # 2.66 insn per cycle - 0.558193661 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.884653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.132509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.132509e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 +TOTAL : 0.447778 sec + 1,569,690,605 cycles # 3.494 GHz + 4,555,017,000 instructions # 2.90 insn per cycle + 0.450120274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.297621 sec - 872,015,724 cycles # 2.894 GHz - 1,933,356,220 instructions # 2.22 insn per cycle - 0.301984624 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.847528e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.908555e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.908555e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.234405 sec + 783,805,869 cycles # 3.322 GHz + 1,921,295,110 instructions # 2.45 insn per cycle + 0.236707878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.276934 sec - 818,470,682 cycles # 2.917 GHz - 1,856,220,484 instructions # 2.27 insn per cycle - 0.281151541 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.304436e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.414872e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.414872e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.223426 sec + 747,350,874 cycles # 3.322 GHz + 1,836,566,153 instructions # 2.46 insn per cycle + 0.225745389 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.358667 sec - 751,185,964 cycles # 2.073 GHz - 1,346,032,296 instructions # 1.79 insn per cycle - 0.362975431 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.000891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.167702e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.167702e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.190037 sec + 635,927,755 cycles # 3.319 GHz + 1,325,913,533 instructions # 2.09 insn per cycle + 0.192251442 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8918bec5c8..5b9052bbb4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:36:50 +DATE: 2024-03-01_19:15:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.436130 sec - 1,959,442,257 cycles # 3.009 GHz - 2,743,667,126 instructions # 1.40 insn per cycle - 0.720037686 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476114 sec - 2,116,952,174 cycles # 3.025 GHz - 3,000,364,507 instructions # 1.42 insn per cycle - 0.758577490 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.438010 sec - 4,446,707,539 cycles # 3.084 GHz - 13,028,651,848 instructions # 2.93 insn per cycle - 1.444314220 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.490551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.525624e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.525624e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 +TOTAL : 1.123253 sec + 3,928,398,999 cycles # 3.493 GHz + 13,013,265,295 instructions # 3.31 insn per cycle + 1.125553354 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.547784 sec - 1,696,823,876 cycles # 3.074 GHz - 4,509,092,353 instructions # 2.66 insn per cycle - 0.559046282 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 3.877785e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.124762e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.124762e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 +TOTAL : 0.445530 sec + 1,559,423,077 cycles # 3.489 GHz + 4,502,604,517 instructions # 2.89 insn per cycle + 0.447755001 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.292180 sec - 859,590,330 cycles # 2.901 GHz - 1,893,994,453 instructions # 2.20 insn per cycle - 0.304986924 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 7.903710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.988700e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.988700e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.230083 sec + 768,596,123 cycles # 3.320 GHz + 1,881,278,062 instructions # 2.45 insn per cycle + 0.232327253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.269638 sec - 798,515,936 cycles # 2.915 GHz - 1,816,168,831 instructions # 2.27 insn per cycle - 0.281600896 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 8.360577e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.493697e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.493697e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.219203 sec + 732,515,894 cycles # 3.320 GHz + 1,795,625,953 instructions # 2.45 insn per cycle + 0.221411578 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.355005 sec - 734,840,966 cycles # 2.046 GHz - 1,303,017,912 instructions # 1.77 insn per cycle - 0.365594980 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.010951e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181673e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181673e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 +TOTAL : 0.185361 sec + 618,797,353 cycles # 3.311 GHz + 1,281,956,446 instructions # 2.07 insn per cycle + 0.187631017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 9473075c44..0e5cce6fad 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:37:07 +DATE: 2024-03-01_19:15:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444064 sec - 2,011,501,510 cycles # 2.996 GHz - 2,813,725,950 instructions # 1.40 insn per cycle - 0.745188123 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.531362 sec - 2,289,898,203 cycles # 2.976 GHz - 3,193,334,828 instructions # 1.39 insn per cycle - 0.827090728 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.528426 sec - 4,733,772,591 cycles # 3.090 GHz - 13,465,129,433 instructions # 2.84 insn per cycle - 1.534888113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.380422e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.409222e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409222e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.212271 sec + 4,238,626,091 cycles # 3.492 GHz + 13,408,625,650 instructions # 3.16 insn per cycle + 1.214366196 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 836) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.843067 sec - 2,603,799,246 cycles # 3.073 GHz - 7,385,481,301 instructions # 2.84 insn per cycle - 0.853727039 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.553855e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.651544e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.651544e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.666771 sec + 2,329,497,090 cycles # 3.485 GHz + 7,376,535,398 instructions # 3.17 insn per cycle + 0.669166331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.502006 sec - 1,465,753,503 cycles # 2.896 GHz - 3,056,435,528 instructions # 2.09 insn per cycle - 0.511483566 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.590474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.884942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.884942e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.381079 sec + 1,268,726,163 cycles # 3.316 GHz + 3,041,731,155 instructions # 2.40 insn per cycle + 0.383435781 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444397 sec - 1,302,869,174 cycles # 2.905 GHz - 2,931,108,724 instructions # 2.25 insn per cycle - 0.456529729 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.865460e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.199799e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.199799e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.361363 sec + 1,203,719,519 cycles # 3.317 GHz + 2,908,941,396 instructions # 2.42 insn per cycle + 0.363683085 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.681918 sec - 1,362,782,748 cycles # 1.986 GHz - 1,970,355,079 instructions # 1.45 insn per cycle - 0.693685126 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.111885e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.475216e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.475216e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.345104 sec + 1,149,300,201 cycles # 3.315 GHz + 1,946,685,540 instructions # 1.69 insn per cycle + 0.347411360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f04f8628ac..b85d55e45a 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,220 +1,183 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasCurand +HASCURAND=hasNoCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_02:37:24 +DATE: 2024-03-01_19:15:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.445224 sec - 1,992,469,002 cycles # 2.992 GHz - 2,813,148,728 instructions # 1.41 insn per cycle - 0.736789901 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% -......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.532147 sec - 2,297,521,664 cycles # 2.990 GHz - 3,210,517,070 instructions # 1.40 insn per cycle - 0.827894226 seconds time elapsed -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.523445 sec - 4,724,741,346 cycles # 3.094 GHz - 13,451,257,746 instructions # 2.85 insn per cycle - 1.529633779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 1.377531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.405985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.405985e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.214658 sec + 4,246,718,650 cycles # 3.492 GHz + 13,407,960,809 instructions # 3.16 insn per cycle + 1.216936825 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 826) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.835617 sec - 2,595,186,002 cycles # 3.089 GHz - 7,389,201,553 instructions # 2.85 insn per cycle - 0.854907608 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 2.553314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650058e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.665859 sec + 2,332,786,191 cycles # 3.495 GHz + 7,378,270,783 instructions # 3.16 insn per cycle + 0.668166884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.503119 sec - 1,466,604,979 cycles # 2.890 GHz - 3,056,260,975 instructions # 2.08 insn per cycle - 0.515296062 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.588001e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.879616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.879616e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.381270 sec + 1,269,081,565 cycles # 3.315 GHz + 3,041,458,855 instructions # 2.40 insn per cycle + 0.383572552 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.457389 sec - 1,310,592,019 cycles # 2.838 GHz - 2,931,897,706 instructions # 2.24 insn per cycle - 0.469608344 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 4.863319e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.195493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.195493e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.361431 sec + 1,203,784,149 cycles # 3.317 GHz + 2,909,476,469 instructions # 2.42 insn per cycle + 0.363729643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.689340 sec - 1,364,202,689 cycles # 1.967 GHz - 1,970,285,028 instructions # 1.44 insn per cycle - 0.699058633 seconds time elapsed +OMP threads / `nproc --all` = 1 / 32 +EvtsPerSec[Rmb+ME] (23) = ( 5.102649e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.463171e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.463171e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 0.345690 sec + 1,150,403,593 cycles # 3.313 GHz + 1,946,604,480 instructions # 1.69 insn per cycle + 0.347985872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. +runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 3 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 From ca849bd6ada6bb622f90ce6989ceaa3292b0ad6b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 11:51:15 +0100 Subject: [PATCH 91/96] [susy2] rerun 18 tmad tests on itgold91 for the first time, all ok - no GPU, but a Gold AVX512 CPU (Eventually the outputs of this test will be split between CPU and GPU...) 
--- .../log_eemumu_mad_d_inl0_hrd0.txt | 308 ++++++----------- .../log_eemumu_mad_f_inl0_hrd0.txt | 310 ++++++----------- .../log_eemumu_mad_m_inl0_hrd0.txt | 306 ++++++----------- .../log_ggtt_mad_d_inl0_hrd0.txt | 308 ++++++----------- .../log_ggtt_mad_f_inl0_hrd0.txt | 306 ++++++----------- .../log_ggtt_mad_m_inl0_hrd0.txt | 304 ++++++----------- .../log_ggttg_mad_d_inl0_hrd0.txt | 306 ++++++----------- .../log_ggttg_mad_f_inl0_hrd0.txt | 312 ++++++----------- .../log_ggttg_mad_m_inl0_hrd0.txt | 308 ++++++----------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 306 ++++++----------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 308 ++++++----------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 310 ++++++----------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 312 ++++++----------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 316 ++++++------------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 310 ++++++----------- .../log_gqttq_mad_d_inl0_hrd0.txt | 310 ++++++----------- .../log_gqttq_mad_f_inl0_hrd0.txt | 312 ++++++----------- .../log_gqttq_mad_m_inl0_hrd0.txt | 310 ++++++----------- 18 files changed, 1827 insertions(+), 3735 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index fb2022a061..a540c52e3f 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:35:28 +DATE: 2024-03-01_19:37:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.6832s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4698s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4632s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1761s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1166s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s - [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1959s + [COUNTERS] Fortran MEs ( 1 ) : 0.0681s for 90112 events => throughput is 1.32E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1273s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.47E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0582s for 90112 events => throughput is 1.55E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.577544e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.582008e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1874s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1201s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3298s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2364s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 90112 events => throughput is 2.64E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.707525e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.816473e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1738s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1181s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.35E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3165s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2272s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2006s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0266s for 90112 events => throughput is 3.39E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.530014e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.685057e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1203s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1180s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2270s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 90112 events => throughput is 3.52E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.623987e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.843267e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1200s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1182s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.42E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2195s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2001s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.66E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.833138e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5894s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7068s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.160270e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 130936da07..01d3bcd1ad 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:35:44 +DATE: 2024-03-01_19:37:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7004s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4777s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4710s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1752s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s - [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1239s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3760s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s - [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2678s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2000s + [COUNTERS] Fortran MEs ( 1 ) : 0.0678s for 90112 events => throughput is 1.33E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1795s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1242s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1193s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 8192 events => throughput is 1.67E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3578s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2527s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0512s for 90112 events => throughput is 1.76E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.824486e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.843269e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1759s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1192s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.20E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1970s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 90112 events => throughput is 4.30E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.470047e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.654667e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1759s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1192s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.53E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2216s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2017s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 90112 events => throughput is 4.55E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.847882e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.058914e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1764s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.54E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3134s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2203s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 90112 events => throughput is 4.62E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.901436e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.119507e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1178s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 6.69E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2147s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2012s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 90112 events => throughput is 6.70E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.099144e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.624026e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index da7367ae5e..aa33fb4f59 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:01 +DATE: 2024-03-01_19:37:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.7189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4707s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4640s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1780s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1183s + [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3702s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s - [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2620s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1946s + [COUNTERS] Fortran MEs ( 1 ) : 0.0674s for 90112 events => throughput is 1.34E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1278s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3592s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2613s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2018s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0594s for 90112 events => throughput is 1.52E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.544290e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.541211e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1220s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2330s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 90112 events => throughput is 2.72E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.764036e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.888476e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3229s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2002s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0264s for 90112 events => throughput is 3.42E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.562659e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.728436e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1830s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1208s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.748896e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.946216e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.73E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.3270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2180s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1992s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 90112 events => throughput is 4.80E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.001566e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.5902s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.7064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002) differ by less than 2E-4 (1.1898038110302878e-11) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.332952e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 657075d34f..f736b14f89 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:18 +DATE: 2024-03-01_19:37:20 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.8052s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s - [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5241s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4919s + [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3849s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s - [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2693s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2371s + [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6297s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s - [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2093s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8563s + [COUNTERS] Fortran MEs ( 1 ) : 0.3530s for 90112 events => throughput is 2.55E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4282s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2948s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0294s for 8192 events => throughput is 2.78E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6901s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3238s for 90112 events => throughput is 2.78E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.789923e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.858469e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3927s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2539s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4997s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0538s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1836s for 90112 events => throughput is 4.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.827269e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.939228e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3811s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2562s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2462s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 8192 events => throughput is 8.13E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4003s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9712s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1105s for 90112 events => throughput is 8.16E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.441330e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.515876e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3737s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2546s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.54E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3804s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9629s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8596s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1033s for 90112 events => throughput is 8.72E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.937684e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.116082e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3919s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2551s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2456s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9646s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8611s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1035s for 90112 events => throughput is 8.71E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.551823e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7828s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6782s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.755678e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index eb011c6697..235aa30713 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:36:45 +DATE: 2024-03-01_19:37:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7779s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4936s + [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3888s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2688s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2366s + [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6449s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s - [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2051s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s + [COUNTERS] Fortran MEs ( 1 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2903s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2635s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0268s for 8192 events => throughput is 3.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6592s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1743s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8796s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 90112 events => throughput is 3.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.132672e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.165533e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2584s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2465s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4203s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9906s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8610s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.95E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.797834e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.867367e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3679s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2496s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3368s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.9271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8599s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0672s for 90112 events => throughput is 1.34E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.382080e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.412593e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2414s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 8192 events => throughput is 1.43E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3377s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0630s for 90112 events => throughput is 1.43E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.495407e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.513487e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3726s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2415s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3760s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9147s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 90112 events => throughput is 1.58E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.606554e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7795s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.7005s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.630447e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index bef66309f6..8034ca9ebb 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_03:37:12 +DATE: 2024-03-01_19:37:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.7917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s - [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5244s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4923s + [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3956s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2362s + [COUNTERS] Fortran MEs ( 1 ) : 0.0326s for 8192 events => throughput is 2.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6496s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s - [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8533s + [COUNTERS] Fortran MEs ( 1 ) : 0.3545s for 90112 events => throughput is 2.54E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.4256s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2655s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2105s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8806s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 90112 events => throughput is 2.73E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.728057e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.785272e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2690s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2525s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.98E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.5038s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0454s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8647s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1806s for 90112 events => throughput is 4.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.062202e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.973810e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3789s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2557s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2457s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.15E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4021s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1097s for 90112 events => throughput is 8.21E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.485845e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.626997e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2541s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.84E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.3844s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9619s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8601s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1018s for 90112 events => throughput is 8.85E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.108224e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.251964e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.3910s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2544s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.4767s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.9643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8623s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1020s for 90112 events => throughput is 8.83E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.699461e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 - [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.7843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 - [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.6837s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6773s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.028075e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index cd3823dd45..36f4d3601a 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:37:39 +DATE: 2024-03-01_19:37:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s - [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2352s + [COUNTERS] Fortran MEs ( 1 ) : 0.2567s for 8192 events => throughput is 3.19E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6286s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s - [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2109s + [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.9846s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s - [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0281s + [COUNTERS] Fortran MEs ( 1 ) : 2.8187s for 90112 events => throughput is 3.20E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 1.0129s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7285s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2625s for 8192 events => throughput is 3.12E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3959s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.1673s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2870s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.8803s for 90112 events => throughput is 3.13E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.206647e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.221936e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6456s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 8192 events => throughput is 6.06E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.4996s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6522s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4900s for 90112 events => throughput is 6.05E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.254884e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.291268e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4803s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3368s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2730s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0638s for 8192 events => throughput is 1.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5052s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7929s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0905s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7023s for 90112 events => throughput is 1.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.314146e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.312940e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4790s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3290s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2694s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3870s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7376s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0829s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6547s for 90112 events => throughput is 1.38E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.414630e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.418410e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5283s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.2981s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7535s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5391s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4712s for 90112 events => throughput is 1.91E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.907849e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7462s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9272s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.952413e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b22193f403..24c504a5ca 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:38:22 +DATE: 2024-03-01_19:38:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6628s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s - [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4906s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s + [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s - [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4674s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2112s + [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.9722s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s - [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8381s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0198s + [COUNTERS] Fortran MEs ( 1 ) : 2.8183s for 90112 events => throughput is 3.20E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9336s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2534s for 8192 events => throughput is 3.23E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.3385s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.0696s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2755s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.7941s for 90112 events => throughput is 3.23E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.313377e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.325155e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4987s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3591s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.5977s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.9379s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1108s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8271s for 90112 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.114703e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.120515e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4003s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9911s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4282s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.549677e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.559529e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3910s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9540s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3923s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3377s for 90112 events => throughput is 2.67E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.759664e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.772533e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4175s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2548s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0203s for 8192 events => throughput is 4.03E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.0938s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2671s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.036698e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9141s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.079449e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 994bc4f8f2..3b804b916f 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:39:00 +DATE: 2024-03-01_19:38:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.6750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s - [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2347s + [COUNTERS] Fortran MEs ( 1 ) : 0.2568s for 8192 events => throughput is 3.19E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s - [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4663s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2100s + [COUNTERS] Fortran MEs ( 1 ) : 0.2563s for 8192 events => throughput is 3.20E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.0099s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s - [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8461s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0203s + [COUNTERS] Fortran MEs ( 1 ) : 2.8258s for 90112 events => throughput is 3.19E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.9635s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7487s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4785s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2702s for 8192 events => throughput is 3.03E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 5.4154s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.2292s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2873s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.9419s for 90112 events => throughput is 3.06E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.165447e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.152487e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.6394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4799s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3471s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1328s for 8192 events => throughput is 6.17E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.4743s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6010s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1530s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4480s for 90112 events => throughput is 6.22E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.328779e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.375772e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4771s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3422s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0635s for 8192 events => throughput is 1.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.4752s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.7929s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0938s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6991s for 90112 events => throughput is 1.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.324352e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.329871e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4581s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3263s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2680s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0583s for 8192 events => throughput is 1.41E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.3626s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7235s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6410s for 90112 events => throughput is 1.41E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.436389e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.440980e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.5403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3043s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0447s for 8192 events => throughput is 1.83E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.7973s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.5642s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0705s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4936s for 90112 events => throughput is 1.83E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.857254e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 - [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7459s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9191s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.864849e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 455a867420..10cb20bbb1 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:39:43 +DATE: 2024-03-01_19:38:52 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.5262s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s - [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2355s + [COUNTERS] Fortran MEs ( 1 ) : 3.4488s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED 
= 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4601s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s - [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6715s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2301s + [COUNTERS] Fortran MEs ( 1 ) : 3.4414s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.7126s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s - [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 39.6093s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4241s + [COUNTERS] Fortran MEs ( 1 ) : 38.1852s for 90112 events => throughput is 2.36E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.7056s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.9402s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5336s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4065s for 8192 events => throughput is 2.40E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 53.1561s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 42.2863s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7174s + [COUNTERS] CudaCpp MEs ( 2 ) : 37.5689s for 90112 events => throughput is 2.40E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.479889e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.479923e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7773s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.7209s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9393s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7816s for 8192 events => throughput is 4.60E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 29.0103s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s + [COUNTERS] PROGRAM TOTAL : 22.6693s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0997s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.5696s for 90112 events => throughput is 4.60E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.828014e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.821992e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2569s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.7069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9577s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7492s for 8192 events => throughput is 1.09E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.7501s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 10.3842s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1372s + [COUNTERS] CudaCpp MEs ( 2 ) : 8.2470s for 90112 events => throughput is 1.09E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.131656e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.130829e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0290s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.5295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8703s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6591s for 8192 events => throughput is 1.24E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2922s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.2923s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0408s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.2515s for 90112 events => throughput is 1.24E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.282050e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.280247e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5040s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.0471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6354s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4117s for 8192 events => throughput is 1.99E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 14.7910s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.3290s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8022s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.5269s for 90112 events => throughput is 1.99E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.029838e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8686s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8233s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow 
summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.031463e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 5e945a4db8..cf2dcec7e0 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg 
CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:43:57 +DATE: 2024-03-01_19:41:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4989s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s - [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6839s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2360s + [COUNTERS] Fortran MEs ( 1 ) : 3.4480s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2300s + [COUNTERS] Fortran MEs ( 1 ) : 3.4416s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.5707s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s - [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 39.2771s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4084s + [COUNTERS] Fortran MEs ( 1 ) : 37.8687s for 90112 events => throughput is 2.38E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.4568s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.7808s + [COUNTERS] Fortran Overhead ( 0 ) : 3.4578s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3230s for 8192 events => throughput is 2.47E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 51.1261s - [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s - [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 41.2049s + [COUNTERS] Fortran Overhead ( 0 ) : 4.6268s + [COUNTERS] CudaCpp MEs ( 2 ) : 36.5781s for 90112 events => throughput is 2.46E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.540625e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.542609e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.5531s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8927s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0485s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8442s for 8192 events => throughput is 9.70E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.4011s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.5324s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2190s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.3134s for 90112 events => throughput is 9.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.994672e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.997533e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.3122s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5920s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 8192 events => throughput is 2.18E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.8863s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.9096s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7724s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1372s for 90112 events => throughput is 2.18E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.262354e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.259720e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3330s for 8192 events => throughput is 2.46E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 7.2113s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3953s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7248s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6705s for 90112 events => throughput is 2.46E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.532533e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.533837e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.4119s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6309s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2032s for 8192 events => throughput is 4.03E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 8.3753s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.8256s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5943s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2313s for 90112 events => throughput is 4.04E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.132811e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8334s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.7017s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.518477e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.135906e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 4a1ef98d00..d3fac22074 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:47:17 +DATE: 2024-03-01_19:43:57 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 4.4720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s - [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6772s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2339s + [COUNTERS] Fortran MEs ( 1 ) : 3.4433s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s - [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6711s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2305s + [COUNTERS] Fortran MEs ( 1 ) : 3.4406s for 8192 events => throughput is 2.38E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 47.6222s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s - [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 39.2987s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4107s + [COUNTERS] Fortran MEs ( 1 ) : 37.8881s for 90112 events => throughput is 2.38E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 8.7912s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.0168s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5696s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.4472s for 8192 events => throughput is 2.38E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 53.4090s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s + [COUNTERS] PROGRAM TOTAL : 42.7588s + [COUNTERS] Fortran Overhead ( 0 ) : 4.7366s + [COUNTERS] CudaCpp MEs ( 2 ) : 38.0222s for 90112 events => throughput is 2.37E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.463868e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.464521e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 4.7232s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.7820s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9818s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8002s for 8192 events => throughput is 4.55E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 28.6711s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 22.9813s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1541s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.8273s for 90112 events => throughput is 4.54E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.685449e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.685208e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.2625s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6443s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9281s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7163s for 8192 events => throughput is 1.14E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 13.6031s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.9868s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0991s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.8878s for 90112 events => throughput is 1.14E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.175286e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.174859e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.0253s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.5228s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8674s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6553s for 8192 events => throughput is 1.25E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 12.2295s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.2502s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0335s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.2167s for 90112 events => throughput is 1.25E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.286275e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.287072e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 2.4883s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.0514s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6358s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4156s for 8192 events => throughput is 1.97E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 15.1764s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.4259s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8132s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6127s for 90112 events => throughput is 1.95E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.016833e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 2.8559s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430504e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.019820e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 6ba33cd625..266d51ffaa 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
+make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_03:53:01 +DATE: 2024-03-01_19:47:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.0689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s - [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 84.8630s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3251s + [COUNTERS] Fortran MEs ( 1 ) : 84.5379s for 8192 events => throughput is 9.69E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.2818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s - [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 84.8695s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3267s + [COUNTERS] Fortran MEs ( 1 ) : 84.5428s for 8192 events => throughput is 9.69E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1058.3505s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s - [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 936.4463s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8572s + [COUNTERS] Fortran MEs ( 1 ) : 933.5891s for 90112 events => throughput is 9.65E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 212.3366s - [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s + [COUNTERS] PROGRAM TOTAL : 177.9314s + [COUNTERS] Fortran Overhead ( 0 ) : 81.6750s + [COUNTERS] CudaCpp MEs ( 2 ) : 96.2564s for 8192 events => throughput is 8.51E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1356.0370s - [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s - [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1143.0884s + [COUNTERS] Fortran Overhead ( 0 ) : 84.1796s + [COUNTERS] CudaCpp MEs ( 2 ) : 1058.9088s for 90112 events => throughput is 8.51E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.008863e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.009325e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 107.3498s - [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 93.6752s + [COUNTERS] Fortran Overhead ( 0 ) : 43.4234s + [COUNTERS] CudaCpp MEs ( 2 ) : 50.2518s for 8192 events => throughput is 1.63E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 690.9132s - [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s - [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 596.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 45.9446s + [COUNTERS] CudaCpp MEs ( 2 ) : 550.4324s for 90112 events => throughput is 1.64E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.905031e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.906475e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.9431s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.5980s + [COUNTERS] Fortran Overhead ( 0 ) : 19.5764s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.0216s for 8192 events => throughput is 3.56E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 318.2044s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s - [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 276.4895s + [COUNTERS] Fortran Overhead ( 0 ) : 22.0996s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.3899s for 90112 events => throughput is 3.54E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.257001e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.254807e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 44.2064s - [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 37.6513s + [COUNTERS] Fortran Overhead ( 0 ) : 17.0538s + [COUNTERS] CudaCpp MEs ( 2 ) : 20.5975s for 8192 events => throughput is 3.98E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 291.1048s - [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s - [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 246.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 19.5567s + [COUNTERS] CudaCpp MEs ( 2 ) : 226.7302s for 90112 events => throughput is 3.97E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.902541e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.907357e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 45.8566s - [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 21.2887s + [COUNTERS] Fortran Overhead ( 0 ) : 9.9280s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.3607s for 8192 events => throughput is 7.21E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 285.1342s - [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s - [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s + [COUNTERS] PROGRAM TOTAL : 136.9333s + [COUNTERS] Fortran Overhead ( 0 ) : 12.3995s + [COUNTERS] CudaCpp MEs ( 2 ) : 124.5338s for 90112 events => throughput is 7.24E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.582235e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 4.2510s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 18.8198s - [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.234312e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.580960e+02 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 2b7ca2c190..0c152a6098 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_05:18:49 +DATE: 2024-03-01_20:57:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.8320s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s - [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s + [COUNTERS] PROGRAM TOTAL : 84.8831s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s + [COUNTERS] Fortran MEs ( 1 ) : 84.5572s for 8192 events => throughput is 9.69E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.1294s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s - [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 85.4196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3272s + [COUNTERS] Fortran MEs ( 1 ) : 85.0923s for 8192 events => throughput is 9.63E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1058.3011s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s - [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 934.8129s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8577s + [COUNTERS] Fortran MEs ( 1 ) : 931.9553s for 90112 events => throughput is 9.67E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,21 +126,21 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719950940886E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 197.7089s - [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s - [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 172.1966s + [COUNTERS] Fortran Overhead ( 0 ) : 79.1644s + [COUNTERS] CudaCpp MEs ( 2 ) : 93.0322s for 8192 events => throughput is 8.81E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719950940886E-006) differ by less than 4E-4 (0.00013985206930144933) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -159,35 +159,35 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290797495657E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1274.0074s - [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s - [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1105.6235s + [COUNTERS] Fortran Overhead ( 0 ) : 81.6815s + [COUNTERS] CudaCpp MEs ( 2 ) : 1023.9420s for 90112 events => throughput is 8.80E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290797495657E-007) differ by less than 4E-4 (0.0001413931234055532) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.041120e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.040723e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 49.6519s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.2598s + [COUNTERS] Fortran Overhead ( 0 ) : 19.7765s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.4833s for 8192 events => throughput is 3.64E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 315.8806s - [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s - [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 269.6254s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2992s + [COUNTERS] CudaCpp MEs ( 2 ) : 247.3262s for 90112 events => throughput is 3.64E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.224551e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.222429e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 25.4788s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s + [COUNTERS] PROGRAM TOTAL : 21.5804s + [COUNTERS] Fortran Overhead ( 0 ) : 10.0211s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5593s for 8192 events => throughput is 7.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 165.7549s - [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s - [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 139.7107s + [COUNTERS] Fortran Overhead ( 0 ) : 12.5414s + [COUNTERS] CudaCpp MEs ( 2 ) : 127.1693s for 90112 events => throughput is 7.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.465406e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.459684e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 22.3180s - [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s + [COUNTERS] PROGRAM TOTAL : 19.0276s + [COUNTERS] Fortran Overhead ( 0 ) : 8.7312s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.2964s for 8192 events => throughput is 7.96E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 145.4310s - [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s - [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 124.5091s + [COUNTERS] Fortran Overhead ( 0 ) : 11.2582s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.2509s for 90112 events => throughput is 7.96E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.781035e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.759033e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 23.0558s - [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s + [COUNTERS] PROGRAM TOTAL : 10.7428s + [COUNTERS] Fortran Overhead ( 0 ) : 5.1100s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.6329s for 8192 events => throughput is 1.45E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 144.1559s - [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s - [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s + [COUNTERS] PROGRAM TOTAL : 69.5332s + [COUNTERS] Fortran Overhead ( 0 ) : 7.6179s + [COUNTERS] CudaCpp MEs ( 2 ) : 61.9153s for 90112 events => throughput is 1.46E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.717335e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 2.4934s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 11.1120s - [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.425348e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.719770e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 99d7cfbcd5..986ca889e8 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 - -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + + +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_06:24:34 +DATE: 2024-03-01_21:53:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.2156s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s - [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 85.2035s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3266s + [COUNTERS] Fortran MEs ( 1 ) : 84.8769s for 8192 events => throughput is 9.65E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 96.1318s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s - [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 84.8113s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3270s + [COUNTERS] Fortran MEs ( 1 ) : 84.4843s for 8192 events => throughput is 9.70E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1057.5728s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s - [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s + [COUNTERS] PROGRAM TOTAL : 937.5305s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8561s + [COUNTERS] Fortran MEs ( 1 ) : 934.6744s for 90112 events => throughput is 9.64E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 220.4361s - [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s - [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s + [COUNTERS] PROGRAM TOTAL : 180.7446s + [COUNTERS] Fortran Overhead ( 0 ) : 83.3239s + [COUNTERS] CudaCpp MEs ( 2 ) : 97.4208s for 8192 events => throughput is 8.41E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1422.8276s - [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s - [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1158.9873s + [COUNTERS] Fortran Overhead ( 0 ) : 85.8624s + [COUNTERS] CudaCpp MEs ( 2 ) : 1073.1249s for 90112 events => throughput is 8.40E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.891814e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.885638e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 110.5022s - [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 90.4857s + [COUNTERS] Fortran Overhead ( 0 ) : 41.7615s + [COUNTERS] CudaCpp MEs ( 2 ) : 48.7242s for 8192 events => throughput is 1.68E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 715.3882s - [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s - [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 580.2521s + [COUNTERS] Fortran Overhead ( 0 ) : 44.3181s + [COUNTERS] CudaCpp MEs ( 2 ) : 535.9340s for 90112 events => throughput is 1.68E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.980238e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.979425e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 48.5744s - [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s + [COUNTERS] PROGRAM TOTAL : 40.8990s + [COUNTERS] Fortran Overhead ( 0 ) : 18.5391s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.3599s for 8192 events => throughput is 3.66E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 319.2663s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s - [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s + [COUNTERS] PROGRAM TOTAL : 267.2159s + [COUNTERS] Fortran Overhead ( 0 ) : 21.0817s + [COUNTERS] CudaCpp MEs ( 2 ) : 246.1343s for 90112 events => throughput is 3.66E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.500304e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.493152e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.4540s - [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 36.1935s + [COUNTERS] Fortran Overhead ( 0 ) : 16.2965s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.8969s for 8192 events => throughput is 4.12E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 277.3470s - [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 237.7664s + [COUNTERS] Fortran Overhead ( 0 ) : 18.8191s + [COUNTERS] CudaCpp MEs ( 2 ) : 218.9473s for 90112 events => throughput is 4.12E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.129109e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.122337e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 45.2143s - [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 20.7508s + [COUNTERS] Fortran Overhead ( 0 ) : 9.5720s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.1788s for 8192 events => throughput is 7.33E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 278.0679s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s - [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s + [COUNTERS] PROGRAM TOTAL : 133.3425s + [COUNTERS] Fortran Overhead ( 0 ) : 12.0919s + [COUNTERS] CudaCpp MEs ( 2 ) : 121.2506s for 90112 events => throughput is 7.43E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.875881e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 3.5884s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 15.9902s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.656493e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.878334e+02 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 8e9ad5ba7a..b84371ad1d 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:51:32 +DATE: 2024-03-01_19:46:46 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4944s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s - [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2585s + [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2739s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2522s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s - [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6236s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0286s + [COUNTERS] Fortran MEs ( 1 ) : 0.5950s for 90112 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3360s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0595s for 8192 events => throughput is 1.38E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7527s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6574s for 90112 events => throughput is 1.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.383515e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.408034e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4004s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2799s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2477s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9658s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4221s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0684s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.533732e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.596028e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3643s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2515s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.68E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7585s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2454s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1920s for 90112 events => throughput is 4.69E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.786503e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.821003e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2309s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.91E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2377s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.024302e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.056314e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3815s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2452s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8893s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2125s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1591s for 90112 events => throughput is 5.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.561724e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9068s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.701897e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 63166c80e0..f8f26accf9 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + + +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:52:02 +DATE: 2024-03-01_19:46:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4536s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s - [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3136s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s + [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3907s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2674s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2714s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s - [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6288s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0330s + [COUNTERS] Fortran MEs ( 1 ) : 0.5958s for 90112 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3315s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.3150s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6971s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0917s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6054s for 90112 events => throughput is 1.49E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.519656e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.508595e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2550s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2350s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7697s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2758s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2188s for 90112 events => throughput is 4.12E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.011797e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.085942e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3421s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2343s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2249s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6208s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1488s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0448s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1040s for 90112 events => throughput is 8.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.884332e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.982873e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2332s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2243s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0090s for 8192 events => throughput is 9.13E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1470s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.431807e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.514860e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3483s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6561s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1246s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0440s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0806s for 90112 events => throughput is 1.12E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.130278e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7423s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8968s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.145791e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index eb4ca92d13..a0472a3076 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_03:52:30 +DATE: 2024-03-01_19:47:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.4522s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s - [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s + [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3858s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s - [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2697s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.2499s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s - [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6293s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0334s + [COUNTERS] Fortran MEs ( 1 ) : 0.5959s for 90112 events => throughput is 1.51E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4694s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3346s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2749s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 2.4601s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7529s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6533s for 90112 events => throughput is 1.38E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.399420e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.416273e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3940s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2792s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2479s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0314s for 8192 events => throughput is 2.61E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9359s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3440s for 90112 events => throughput is 2.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.596110e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.619319e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3636s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2501s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2330s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7468s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2416s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1880s for 90112 events => throughput is 4.79E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.893459e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.921901e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3573s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 8192 events => throughput is 5.06E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7304s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2303s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0527s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1776s for 90112 events => throughput is 5.07E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.182586e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.219883e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.4046s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2489s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2335s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.34E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [OPENMPTH] omp_get_max_threads/nproc = 1/32 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.9542s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2187s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0503s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 90112 events => throughput is 5.35E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,119 +484,13 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.247590e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 - [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.7473s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 - [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.8944s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.440729e+05 ) sec^-1 TEST COMPLETED From 4b09db3bf858e6dfcf9dc51061a0c22493051b76 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 18:52:45 +0100 Subject: [PATCH 92/96] [susy2] go back to itscrd90 test logs git checkout f4d951c7ddfc635707c14e0fe5a0628fd4aec0ac tput/logs_* tmad/logs_* --- .../log_eemumu_mad_d_inl0_hrd0.txt | 308 +++++++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 310 +++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 304 
+++++++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 312 +++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 312 +++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 316 ++++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 310 +++++++++++------ .../log_gqttq_mad_f_inl0_hrd0.txt | 312 +++++++++++------ .../log_gqttq_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_eemumu_mad_d_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 223 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt | 195 ++++++----- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 190 +++++++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 217 ++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 223 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_common.txt | 195 ++++++----- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 190 +++++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 217 ++++++------ .../log_eemumu_mad_f_inl0_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_f_inl1_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 225 +++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 197 ++++++----- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 192 +++++++---- 
.../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 219 ++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 217 ++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 215 ++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 225 +++++++------ .../log_ggtt_mad_f_inl0_hrd0_common.txt | 197 ++++++----- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 192 +++++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 219 ++++++------ .../log_ggtt_mad_f_inl0_hrd1.txt | 217 ++++++------ .../log_ggtt_mad_f_inl1_hrd0.txt | 215 ++++++------ .../log_ggtt_mad_f_inl1_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_m_inl0_hrd1.txt | 217 ++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttg_mad_f_inl0_hrd1.txt | 231 +++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 233 +++++++------ .../log_ggttg_mad_m_inl0_hrd1.txt | 233 +++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 211 +++++++----- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 206 ++++++++---- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 234 +++++++------ .../log_ggttgg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl1_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl1_hrd1.txt | 231 +++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 211 +++++++----- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 206 ++++++++---- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 234 +++++++------ 
.../log_ggttgg_mad_f_inl0_hrd1.txt | 237 +++++++------ .../log_ggttgg_mad_f_inl1_hrd0.txt | 235 +++++++------ .../log_ggttgg_mad_f_inl1_hrd1.txt | 235 +++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_m_inl0_hrd1.txt | 231 +++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttggg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 235 +++++++------ .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 247 ++++++++------ .../log_ggttggg_mad_f_inl0_hrd1.txt | 235 +++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_ggttggg_mad_m_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_gqttq_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_gqttq_mad_f_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_m_inl0_hrd1.txt | 231 +++++++------ 96 files changed, 13818 insertions(+), 9116 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index a540c52e3f..fb2022a061 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:08 +DATE: 2024-03-01_03:35:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4698s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4632s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6832s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1232s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1166s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1761s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1959s - [COUNTERS] Fortran MEs ( 1 ) : 0.0681s for 90112 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.47E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2024s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0582s for 90112 events => throughput is 1.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.577544e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.582008e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2364s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2023s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 90112 events => throughput is 2.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.707525e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.816473e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1181s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2006s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0266s for 90112 events => throughput is 3.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.530014e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.685057e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 90112 events => throughput is 3.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.623987e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.843267e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1200s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2001s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.66E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.833138e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.160270e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 01d3bcd1ad..130936da07 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:12 +DATE: 2024-03-01_03:35:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4777s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4710s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2000s - [COUNTERS] Fortran MEs ( 1 ) : 0.0678s for 90112 events => throughput is 1.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 8192 events => throughput is 1.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2527s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0512s for 90112 events => throughput is 1.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3578s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.824486e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.843269e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1192s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 90112 events => throughput is 4.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.470047e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.654667e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1192s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2017s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 90112 events => throughput is 4.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3181s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.847882e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.058914e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1207s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 90112 events => throughput is 4.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.901436e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.119507e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 6.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2012s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 90112 events => throughput is 6.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099144e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.624026e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index aa33fb4f59..da7367ae5e 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:16 +DATE: 2024-03-01_03:36:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4707s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4640s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1183s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2620s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1946s - [COUNTERS] Fortran MEs ( 1 ) : 0.0674s for 90112 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s + [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1278s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1222s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0594s for 90112 events => throughput is 1.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3592s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544290e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.541211e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2330s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 90112 events => throughput is 2.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.764036e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.888476e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0264s for 90112 events => throughput is 3.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.562659e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728436e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1830s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.748896e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946216e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 90112 events => throughput is 4.80E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3270s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.001566e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.332952e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002) differ by less than 2E-4 (1.1898038110302878e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** 
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index f736b14f89..657075d34f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:20 +DATE: 2024-03-01_03:36:18 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4919s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8052s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s + [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2371s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2093s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8563s - [COUNTERS] Fortran MEs ( 1 ) : 0.3530s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6297s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s + [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0294s for 8192 events => throughput is 2.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2065s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3238s for 90112 events => throughput is 2.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.789923e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.858469e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2539s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.0538s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1836s for 90112 events => throughput is 4.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4997s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.827269e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.939228e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2562s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2462s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 8192 events => throughput is 8.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9712s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1105s for 90112 events => throughput is 8.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441330e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.515876e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9629s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1033s for 90112 events => throughput is 8.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.937684e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.116082e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2456s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3919s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1035s for 90112 events => throughput is 8.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.551823e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.755678e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.6782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 235aa30713..eb011c6697 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:30 +DATE: 2024-03-01_03:36:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5257s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4936s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2688s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2366s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2051s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s - [COUNTERS] Fortran MEs ( 1 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6449s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s + [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2635s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0268s for 8192 events => throughput is 3.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1743s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 90112 events => throughput is 3.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6592s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.132672e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.165533e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2584s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8610s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.797834e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.867367e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8599s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0672s for 90112 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3368s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.382080e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412593e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2414s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 8192 events => throughput is 1.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0630s for 90112 events => throughput is 1.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3377s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.495407e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.513487e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2466s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2415s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 90112 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.606554e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.630447e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.7005s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 8034ca9ebb..bef66309f6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:40 +DATE: 2024-03-01_03:37:12 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5244s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4923s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2362s - [COUNTERS] Fortran MEs ( 1 ) : 0.0326s for 8192 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2077s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8533s - [COUNTERS] Fortran MEs ( 1 ) : 0.3545s for 90112 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6496s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s + [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2956s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2655s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8806s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 90112 events => throughput is 2.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.728057e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785272e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2525s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.0454s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1806s for 90112 events => throughput is 4.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.062202e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.973810e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2557s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2457s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9697s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8600s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1097s for 90112 events => throughput is 8.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4021s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.485845e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.626997e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2541s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9619s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1018s for 90112 events => throughput is 8.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.108224e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.251964e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9643s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8623s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1020s for 90112 events => throughput is 8.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4767s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.699461e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.028075e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.6837s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 36f4d3601a..cd3823dd45 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:50 +DATE: 2024-03-01_03:37:39 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4918s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2352s - [COUNTERS] Fortran MEs ( 1 ) : 0.2567s for 8192 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2109s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0281s - [COUNTERS] Fortran MEs ( 1 ) : 2.8187s for 90112 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s + [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4659s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2625s for 8192 events => throughput is 3.12E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.1673s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2870s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8803s for 90112 events => throughput is 3.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3959s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206647e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221936e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 8192 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6522s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1622s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4900s for 90112 events => throughput is 6.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4996s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.254884e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.291268e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3368s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2730s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0638s for 8192 events => throughput is 1.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0905s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7023s for 90112 events => throughput is 1.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5052s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.314146e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.312940e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7376s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0829s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6547s for 90112 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3870s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.414630e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.418410e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2981s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.5391s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4712s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7535s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.907849e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.952413e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 24c504a5ca..b22193f403 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:12 +DATE: 2024-03-01_03:38:22 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6628s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4674s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2112s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8381s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0198s - [COUNTERS] Fortran MEs ( 1 ) : 2.8183s for 90112 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2534s for 8192 events => throughput is 3.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.0696s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2755s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.7941s for 90112 events => throughput is 3.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3385s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.313377e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325155e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3591s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9379s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1108s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8271s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5977s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114703e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120515e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.4282s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9911s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549677e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.559529e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2701s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.3923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3377s for 90112 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9540s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.759664e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.772533e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0203s for 8192 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.2671s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0938s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.036698e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.079449e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 3b804b916f..994bc4f8f2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:31 +DATE: 2024-03-01_03:39:00 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2347s - [COUNTERS] Fortran MEs ( 1 ) : 0.2568s for 8192 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s + [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2100s - [COUNTERS] Fortran MEs ( 1 ) : 0.2563s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s + [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0203s - [COUNTERS] Fortran MEs ( 1 ) : 2.8258s for 90112 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s + [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7487s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4785s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2702s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.2292s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2873s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.9419s for 90112 events => throughput is 3.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.165447e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.152487e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3471s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1328s for 8192 events => throughput is 6.17E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6394s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6010s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1530s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4480s for 90112 events => throughput is 6.22E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.328779e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.375772e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3422s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2786s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0635s for 8192 events => throughput is 1.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4771s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0938s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6991s for 90112 events => throughput is 1.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.324352e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.329871e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2680s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0583s for 8192 events => throughput is 1.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7235s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6410s for 90112 events => throughput is 1.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3626s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436389e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.440980e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0447s for 8192 events => throughput is 1.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5403s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.5642s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4936s for 90112 events => throughput is 1.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857254e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.864849e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7459s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9191s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA 
[nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 10cb20bbb1..455a867420 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:52 +DATE: 2024-03-01_03:39:43 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2355s - [COUNTERS] Fortran MEs ( 1 ) : 3.4488s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s + [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED 
= 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6715s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2301s - [COUNTERS] Fortran MEs ( 1 ) : 3.4414s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.6093s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4241s - [COUNTERS] Fortran MEs ( 1 ) : 38.1852s for 90112 events => throughput is 2.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7126s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s + [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.9402s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5336s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4065s for 8192 events => throughput is 2.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7056s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 42.2863s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7174s - [COUNTERS] CudaCpp MEs ( 2 ) : 37.5689s for 90112 events => throughput is 2.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.1561s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.479889e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.479923e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.7209s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9393s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7816s for 8192 events => throughput is 4.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7773s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 22.6693s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0997s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.5696s for 90112 events => throughput is 4.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.0103s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.828014e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.821992e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.7069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9577s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7492s for 8192 events => throughput is 1.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 10.3842s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1372s - [COUNTERS] CudaCpp MEs ( 2 ) : 8.2470s for 90112 events => throughput is 1.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 13.7501s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131656e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.130829e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.5295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6591s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0290s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.2923s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0408s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.2515s for 90112 events => throughput is 1.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 12.2922s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.282050e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.280247e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.0471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6354s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4117s for 8192 events => throughput is 1.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 6.3290s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8022s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5269s for 90112 events => throughput is 1.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 14.7910s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029838e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.031463e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8233s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow 
summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index cf2dcec7e0..5e945a4db8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:41:40 +DATE: 2024-03-01_03:43:57 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6839s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2360s - [COUNTERS] Fortran MEs ( 1 ) : 3.4480s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2300s - [COUNTERS] Fortran MEs ( 1 ) : 3.4416s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.2771s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4084s - [COUNTERS] Fortran MEs ( 1 ) : 37.8687s for 90112 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5707s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s + [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.7808s - [COUNTERS] Fortran Overhead ( 0 ) : 3.4578s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3230s for 8192 events => throughput is 2.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 41.2049s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6268s - [COUNTERS] CudaCpp MEs ( 2 ) : 36.5781s for 90112 events => throughput is 2.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.1261s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540625e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542609e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.8927s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0485s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8442s for 8192 events => throughput is 9.70E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 11.5324s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.3134s for 90112 events => throughput is 9.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4011s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.994672e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.997533e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 8192 events => throughput is 2.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 5.9096s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7724s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1372s for 90112 events => throughput is 2.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8863s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262354e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.259720e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3330s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 5.3953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7248s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6705s for 90112 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.2113s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.532533e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533837e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.6309s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2032s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 3.8256s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5943s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2313s for 90112 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.132811e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.135906e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.7017s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.518477e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index d3fac22074..4a1ef98d00 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - -make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:43:57 +DATE: 2024-03-01_03:47:17 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6772s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2339s - [COUNTERS] Fortran MEs ( 1 ) : 3.4433s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED 
= 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2305s - [COUNTERS] Fortran MEs ( 1 ) : 3.4406s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s + [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.2987s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4107s - [COUNTERS] Fortran MEs ( 1 ) : 37.8881s for 90112 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.6222s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s + [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.0168s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5696s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4472s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7912s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 42.7588s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7366s - [COUNTERS] CudaCpp MEs ( 2 ) : 38.0222s for 90112 events => throughput is 2.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.4090s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.463868e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.464521e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.7820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9818s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8002s for 8192 events => throughput is 4.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7232s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 22.9813s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1541s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8273s for 90112 events => throughput is 4.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.6711s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.685449e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.685208e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7163s for 8192 events => throughput is 1.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2625s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.9868s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0991s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.8878s for 90112 events => throughput is 1.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 13.6031s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.175286e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174859e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8674s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6553s for 8192 events => throughput is 1.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0253s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.2502s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0335s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.2167s for 90112 events => throughput is 1.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 12.2295s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.286275e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.287072e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.0514s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6358s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4156s for 8192 events => throughput is 1.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4883s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 6.4259s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8132s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6127s for 90112 events => throughput is 1.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 15.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.016833e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.019820e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8559s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = 
SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430504e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 266d51ffaa..6ba33cd625 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:47:22 +DATE: 2024-03-01_03:53:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8630s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3251s - [COUNTERS] Fortran MEs ( 1 ) : 84.5379s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s + [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8695s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3267s - [COUNTERS] Fortran MEs ( 1 ) : 84.5428s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 936.4463s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8572s - [COUNTERS] Fortran MEs ( 1 ) : 933.5891s for 90112 events => throughput is 9.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3505s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 177.9314s - [COUNTERS] Fortran Overhead ( 0 ) : 81.6750s - [COUNTERS] CudaCpp MEs ( 2 ) : 96.2564s for 8192 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 212.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1143.0884s - [COUNTERS] Fortran Overhead ( 0 ) : 84.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 1058.9088s for 90112 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1356.0370s + [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s + [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008863e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.009325e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 93.6752s - [COUNTERS] Fortran Overhead ( 0 ) : 43.4234s - [COUNTERS] CudaCpp MEs ( 2 ) : 50.2518s for 8192 events => throughput is 1.63E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.3498s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 596.3770s - [COUNTERS] Fortran Overhead ( 0 ) : 45.9446s - [COUNTERS] CudaCpp MEs ( 2 ) : 550.4324s for 90112 events => throughput is 1.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 690.9132s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s + [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.905031e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.906475e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.5980s - [COUNTERS] Fortran Overhead ( 0 ) : 19.5764s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.0216s for 8192 events => throughput is 3.56E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.9431s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 276.4895s - [COUNTERS] Fortran Overhead ( 0 ) : 22.0996s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.3899s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 318.2044s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.257001e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.254807e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 37.6513s - [COUNTERS] Fortran Overhead ( 0 ) : 17.0538s - [COUNTERS] CudaCpp MEs ( 2 ) : 20.5975s for 8192 events => throughput is 3.98E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.2064s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 246.2869s - [COUNTERS] Fortran Overhead ( 0 ) : 19.5567s - [COUNTERS] CudaCpp MEs ( 2 ) : 226.7302s for 90112 events => throughput is 3.97E+02 events/s + [COUNTERS] PROGRAM TOTAL : 291.1048s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s + [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.902541e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.907357e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 21.2887s - [COUNTERS] Fortran Overhead ( 0 ) : 9.9280s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3607s for 8192 events => throughput is 7.21E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 136.9333s - [COUNTERS] Fortran Overhead ( 0 ) : 12.3995s - [COUNTERS] CudaCpp MEs ( 2 ) : 124.5338s for 90112 events => throughput is 7.24E+02 events/s + [COUNTERS] PROGRAM TOTAL : 285.1342s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.582235e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.580960e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 4.2510s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 18.8198s + [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.234312e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 0c152a6098..2b7ca2c190 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' 
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_20:57:42 +DATE: 2024-03-01_05:18:49 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s - [COUNTERS] Fortran MEs ( 1 ) : 84.5572s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.8320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 85.4196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3272s - [COUNTERS] Fortran MEs ( 1 ) : 85.0923s for 8192 events => throughput is 9.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s + [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 934.8129s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8577s - [COUNTERS] Fortran MEs ( 1 ) : 931.9553s for 90112 events => throughput is 9.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,21 +126,21 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719950940886E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 172.1966s - [COUNTERS] Fortran Overhead ( 0 ) : 79.1644s - [COUNTERS] CudaCpp MEs ( 2 ) : 93.0322s for 8192 events => throughput is 8.81E+01 events/s + [COUNTERS] PROGRAM TOTAL : 197.7089s + [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719950940886E-006) differ by less than 4E-4 (0.00013985206930144933) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -159,35 +159,35 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326290797495657E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1105.6235s - [COUNTERS] Fortran Overhead ( 0 ) : 81.6815s - [COUNTERS] CudaCpp MEs ( 2 ) : 1023.9420s for 90112 events => throughput is 8.80E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1274.0074s + [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s + [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290797495657E-007) differ by less than 4E-4 (0.0001413931234055532) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.041120e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.040723e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.2598s - [COUNTERS] Fortran Overhead ( 0 ) : 19.7765s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.4833s for 8192 events => throughput is 3.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6519s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 269.6254s - [COUNTERS] Fortran Overhead ( 0 ) : 22.2992s - [COUNTERS] CudaCpp MEs ( 2 ) : 247.3262s for 90112 events => throughput is 3.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 315.8806s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.224551e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.222429e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 21.5804s - [COUNTERS] Fortran Overhead ( 0 ) : 10.0211s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.5593s for 8192 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 139.7107s - [COUNTERS] Fortran Overhead ( 0 ) : 12.5414s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.1693s for 90112 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 165.7549s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s + [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.465406e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.459684e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 19.0276s - [COUNTERS] Fortran Overhead ( 0 ) : 8.7312s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.2964s for 8192 events => throughput is 7.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 124.5091s - [COUNTERS] Fortran Overhead ( 0 ) : 11.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.2509s for 90112 events => throughput is 7.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.781035e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.759033e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 10.7428s - [COUNTERS] Fortran Overhead ( 0 ) : 5.1100s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.6329s for 8192 events => throughput is 1.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 23.0558s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 69.5332s - [COUNTERS] Fortran Overhead ( 0 ) : 7.6179s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.9153s for 90112 events => throughput is 1.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 144.1559s + [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.717335e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.719770e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 2.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 11.1120s + [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.425348e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 986ca889e8..99d7cfbcd5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z - - -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_21:53:07 +DATE: 2024-03-01_06:24:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 85.2035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3266s - [COUNTERS] Fortran MEs ( 1 ) : 84.8769s for 8192 events => throughput is 9.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ 
CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3270s - [COUNTERS] Fortran MEs ( 1 ) : 84.4843s for 8192 events => throughput is 9.70E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1318s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 937.5305s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8561s - [COUNTERS] Fortran MEs ( 1 ) : 934.6744s for 90112 events => throughput is 9.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1057.5728s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s + [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 180.7446s - [COUNTERS] Fortran Overhead ( 0 ) : 83.3239s - [COUNTERS] CudaCpp MEs ( 2 ) : 97.4208s for 8192 events => throughput is 8.41E+01 events/s + [COUNTERS] PROGRAM TOTAL : 220.4361s + [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s + [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1158.9873s - [COUNTERS] Fortran Overhead ( 0 ) : 85.8624s - [COUNTERS] CudaCpp MEs ( 2 ) : 1073.1249s for 90112 events => throughput is 8.40E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1422.8276s + [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s + [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.891814e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.885638e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 90.4857s - [COUNTERS] Fortran Overhead ( 0 ) : 41.7615s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.7242s for 8192 events => throughput is 1.68E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.5022s + [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 580.2521s - [COUNTERS] Fortran Overhead ( 0 ) : 44.3181s - [COUNTERS] CudaCpp MEs ( 2 ) : 535.9340s for 90112 events => throughput is 1.68E+02 events/s + [COUNTERS] PROGRAM TOTAL : 715.3882s + [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s + [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.980238e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.979425e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 40.8990s - [COUNTERS] Fortran Overhead ( 0 ) : 18.5391s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.3599s for 8192 events => throughput is 3.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.5744s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 267.2159s - [COUNTERS] Fortran Overhead ( 0 ) : 21.0817s - [COUNTERS] CudaCpp MEs ( 2 ) : 246.1343s for 90112 events => throughput is 3.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 319.2663s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.500304e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.493152e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 36.1935s - [COUNTERS] Fortran Overhead ( 0 ) : 16.2965s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8969s for 8192 events => throughput is 4.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 237.7664s - [COUNTERS] Fortran Overhead ( 0 ) : 18.8191s - [COUNTERS] CudaCpp MEs ( 2 ) : 218.9473s for 90112 events => throughput is 4.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 277.3470s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.129109e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.122337e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 20.7508s - [COUNTERS] Fortran Overhead ( 0 ) : 9.5720s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.1788s for 8192 events => throughput is 7.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.2143s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 133.3425s - [COUNTERS] Fortran Overhead ( 0 ) : 12.0919s - [COUNTERS] CudaCpp MEs ( 2 ) : 121.2506s for 90112 events => throughput is 7.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 278.0679s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s + [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.875881e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.878334e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 3.5884s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 15.9902s + [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.656493e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index b84371ad1d..8e9ad5ba7a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory 
'/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:46:46 +DATE: 2024-03-01_03:51:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3129s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2585s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2739s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6236s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0286s - [COUNTERS] Fortran MEs ( 1 ) : 0.5950s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s + [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0595s for 8192 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7527s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6574s for 90112 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.383515e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.408034e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2477s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0684s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9658s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533732e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.596028e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2454s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1920s for 90112 events => throughput is 4.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7585s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.786503e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.821003e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2309s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.65E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2377s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.024302e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.056314e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2125s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1591s for 90112 events => throughput is 5.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8893s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.561724e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.701897e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index f8f26accf9..63166c80e0 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:46:58 +DATE: 2024-03-01_03:52:02 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s - [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4536s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2674s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6288s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0330s - [COUNTERS] Fortran MEs ( 1 ) : 0.5958s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s + [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3315s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6971s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0917s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6054s for 90112 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.519656e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.508595e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2550s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2758s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0569s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2188s for 90112 events => throughput is 4.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.011797e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.085942e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2343s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2249s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1488s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0448s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1040s for 90112 events => throughput is 8.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6208s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.884332e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.982873e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2243s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0090s for 8192 events => throughput is 9.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1470s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.431807e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.514860e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1246s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0806s for 90112 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.130278e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.145791e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8968s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index a0472a3076..eb4ca92d13 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:47:09 +DATE: 2024-03-01_03:52:30 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2697s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6293s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0334s - [COUNTERS] Fortran MEs ( 1 ) : 0.5959s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2499s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s + [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2749s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0996s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6533s for 90112 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.399420e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.416273e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0314s for 8192 events => throughput is 2.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4119s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3440s for 90112 events => throughput is 2.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9359s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.596110e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.619319e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2501s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2330s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2416s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1880s for 90112 events => throughput is 4.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.893459e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.921901e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 8192 events => throughput is 5.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3573s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2303s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0527s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1776s for 90112 events => throughput is 5.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7304s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.182586e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.219883e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2489s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2187s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0503s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 90112 events => throughput is 5.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9542s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.247590e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.440729e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7473s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8944s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index ed28df1cad..baa8c044cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for 
tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:08 +DATE: 2024-03-01_02:23:52 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.907657 sec + 2,864,594,511 cycles # 3.017 GHz + 4,419,491,827 instructions # 1.54 insn per cycle + 1.243823060 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.425993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645967e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645967e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.035526 sec - 17,606,144,392 cycles # 3.495 GHz - 44,071,163,755 instructions # 2.50 insn per cycle - 5.038823878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.029383 sec + 18,345,746,310 cycles # 3.041 GHz + 43,971,705,846 instructions # 2.40 insn per cycle + 6.038464488 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.250731e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.893394e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.893394e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.401156 sec - 11,895,409,777 cycles # 3.495 GHz - 30,996,596,899 instructions # 2.61 insn per cycle - 3.404468118 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.154865 sec + 12,823,382,487 cycles # 3.082 GHz + 30,998,172,347 instructions # 2.42 insn per cycle + 4.171623433 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.710645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676510e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676510e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.897930 sec - 9,801,226,041 cycles # 3.380 GHz - 19,263,940,378 instructions # 1.97 insn per cycle - 2.901252745 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.406763 sec + 10,081,289,557 cycles # 2.955 GHz + 19,366,111,959 instructions # 1.92 insn per cycle + 3.427414790 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.787050e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.808409e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.808409e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.849886 sec - 9,645,321,314 cycles # 3.382 GHz - 18,674,064,614 instructions # 1.94 insn per cycle - 2.853150797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.257696 sec + 9,685,682,355 cycles # 2.968 GHz + 18,976,171,527 instructions # 1.96 insn per cycle + 3.273948471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.516175e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.268017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.268017e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.378272 sec - 8,078,122,389 cycles # 3.393 GHz - 15,420,673,885 instructions # 1.91 insn per cycle - 2.381601605 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.888242 sec + 8,621,851,062 cycles # 2.214 GHz + 15,727,334,662 instructions # 1.82 insn per cycle + 3.905958468 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 062460fe0a..b9ff72dbf3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:28:58 +DATE: 2024-03-01_03:12:58 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.232505 sec + 7,524,955,995 cycles # 3.041 GHz + 13,468,669,108 instructions # 1.79 insn per cycle + 2.532807464 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.384713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.591202e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.591202e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.251037 sec - 18,383,972,346 cycles # 3.499 GHz - 44,316,778,142 instructions # 2.41 insn per cycle - 5.255195321 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.415532 sec + 19,561,606,037 cycles # 3.046 GHz + 44,198,639,919 instructions # 2.26 insn per cycle + 6.422457347 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.136391e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.710309e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.710309e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.710222 sec - 12,989,118,477 cycles # 3.498 GHz - 31,817,590,346 instructions # 2.45 insn per cycle - 3.714348648 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.664054 sec + 13,997,557,946 cycles # 2.998 GHz + 31,841,279,233 instructions # 2.27 insn per cycle + 4.670791737 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.564211e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.413599e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.413599e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.183000 sec - 10,807,008,900 cycles # 3.392 GHz - 20,602,119,150 instructions # 1.91 insn per cycle - 3.187089748 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.823801 sec + 11,324,833,068 cycles # 2.957 GHz + 20,724,775,427 instructions # 1.83 insn per cycle + 3.830534322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.621053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.507982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.507982e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.143575 sec - 10,677,287,374 cycles # 3.394 GHz - 20,011,376,117 instructions # 1.87 insn per cycle - 3.147703877 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.704930 sec + 10,963,593,820 cycles # 2.954 GHz + 20,347,072,159 instructions # 1.86 insn per cycle + 3.711957869 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.247983e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.684393e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.684393e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.688755 sec - 9,174,309,622 cycles # 3.408 GHz - 16,543,784,332 instructions # 1.80 insn per cycle - 2.692848189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.214412 sec + 9,956,996,891 cycles # 2.360 GHz + 16,873,658,319 instructions # 1.69 insn per cycle + 4.221168968 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index bec395b2fc..09aaad1dd8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:42 +DATE: 2024-03-01_03:26:09 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 1.329039 sec + 4,626,136,964 cycles # 2.966 GHz + 7,229,705,832 instructions # 1.56 insn per cycle + 1.616136536 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.400069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.612023e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.612023e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.119023 sec - 17,915,758,115 cycles # 3.498 GHz - 44,072,070,040 instructions # 2.46 insn per cycle - 5.122383130 seconds time elapsed +TOTAL : 6.368910 sec + 19,436,039,687 cycles # 3.050 GHz + 44,075,637,403 instructions # 2.27 insn per cycle + 6.374367735 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.897562e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.897562e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.399887 sec - 11,900,907,245 cycles # 3.498 GHz - 30,996,702,046 instructions # 2.60 insn per cycle - 3.403003584 seconds time elapsed +TOTAL : 4.477126 sec + 13,840,650,655 cycles # 3.088 GHz + 31,000,398,658 instructions # 2.24 insn per cycle + 4.482579907 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.718441e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.683164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.683164e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.889969 sec - 9,778,075,616 cycles # 3.381 GHz - 19,263,677,928 instructions # 1.97 insn per cycle - 2.893107114 seconds time elapsed +TOTAL : 3.779571 sec + 11,221,356,305 cycles # 2.967 GHz + 19,268,573,834 instructions # 1.72 insn per cycle + 3.784933241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.796994e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822794e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822794e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.839931 sec - 9,615,587,176 cycles # 3.383 GHz - 18,673,582,472 instructions # 1.94 insn per cycle - 2.843100077 seconds time elapsed +TOTAL : 3.643336 sec + 10,818,026,445 cycles # 2.966 GHz + 18,676,470,141 instructions # 1.73 insn per cycle + 3.648853496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.505324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.277259e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.277259e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.384419 sec - 8,111,960,646 cycles # 3.399 GHz - 15,421,322,877 instructions # 1.90 insn per cycle - 2.387622919 seconds time elapsed +TOTAL : 4.111357 sec + 9,725,602,646 cycles # 2.364 GHz + 15,429,502,829 instructions # 1.59 insn per cycle + 4.116843302 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 9bceb91dab..c5fdf6f3c6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:05 +DATE: 2024-03-01_03:22:56 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.511929e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.606834e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.132028e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.970663 sec + 3,681,129,197 cycles # 3.043 GHz + 7,185,953,404 instructions # 1.95 insn per cycle + 1.266725293 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted - 4,565,469 cycles # 3.250 GHz - 6,288,877 instructions # 1.38 insn per cycle - 0.042167323 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.129015e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325606e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.971935 sec + 18,327,370,852 cycles # 3.067 GHz + 43,971,442,751 instructions # 2.40 insn per cycle + 5.977352348 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,579,291 cycles # 2.729 GHz - 6,314,135 instructions # 1.38 insn per cycle - 0.039024734 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.658250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.168305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168305e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.196892 sec + 12,732,971,160 cycles # 3.031 GHz + 30,998,026,084 instructions # 2.43 insn per cycle + 4.202372987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,579,990 cycles # 2.736 GHz - 6,320,509 instructions # 1.38 insn per cycle - 0.038108900 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.058430e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883101e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.446922 sec + 10,145,804,321 cycles # 2.940 GHz + 19,366,948,979 instructions # 1.91 insn per cycle + 3.452452971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,556,853 cycles # 2.681 GHz - 6,314,123 instructions # 1.39 insn per cycle - 0.038394657 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.138596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.023243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.339653 sec + 9,693,126,342 cycles # 2.898 GHz + 18,976,550,822 instructions # 1.96 insn per cycle + 3.345442131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,605,984 cycles # 2.673 GHz - 6,322,733 instructions # 1.37 insn per cycle - 0.038325628 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.879529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506982e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.741561 sec + 8,595,853,951 cycles # 2.295 GHz + 15,727,211,339 instructions # 1.83 insn per cycle + 3.747065146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index dd5c05e2b0..4a4acadae4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:32:16 +DATE: 2024-03-01_03:19:38 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.841184 sec + 6,281,268,865 cycles # 3.032 GHz + 11,616,541,551 instructions # 1.85 insn per cycle + 2.127335919 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.409868e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625204e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.085348 sec - 17,784,058,163 cycles # 3.496 GHz - 44,072,184,676 instructions # 2.48 insn per cycle - 5.088440281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.931254 sec + 18,320,874,631 cycles # 3.087 GHz + 43,971,483,251 instructions # 2.40 insn per cycle + 5.936943481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251897e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.896740e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.896740e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.399875 sec - 11,894,015,527 cycles # 3.496 GHz - 30,996,681,554 instructions # 2.61 insn per cycle - 3.403119350 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.142725 sec + 12,747,370,194 cycles # 3.074 GHz + 30,997,666,885 instructions # 2.43 insn per cycle + 4.148307465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.713038e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676668e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.893827 sec - 9,788,402,080 cycles # 3.380 GHz - 19,263,518,574 instructions # 1.97 insn per cycle - 2.897020495 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.411600 sec + 10,085,079,136 cycles # 2.953 GHz + 19,364,558,625 instructions # 1.92 insn per cycle + 3.417084709 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.802736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.835064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.835064e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.835699 sec - 9,598,762,677 cycles # 3.382 GHz - 18,673,739,851 instructions # 1.95 insn per cycle - 2.838932444 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.338836 sec + 9,731,023,917 cycles # 2.911 GHz + 18,988,816,377 instructions # 1.95 insn per cycle + 3.344328310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.529044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.287257e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.287257e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.372119 sec - 8,060,639,797 cycles # 3.395 GHz - 15,420,889,940 instructions # 1.91 insn per cycle - 2.375266541 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.766791 sec + 8,586,243,314 cycles # 2.277 GHz + 15,726,194,960 instructions # 1.83 insn per cycle + 3.772300478 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index c765405708..acaec4a100 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:29 +DATE: 2024-03-01_02:24:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699180 sec + 2,815,032,547 cycles # 3.020 GHz + 4,411,732,319 instructions # 1.57 insn per cycle + 1.012826906 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.526914e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781748e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.781748e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.743355 sec - 16,588,877,180 cycles # 3.495 GHz - 41,918,002,874 instructions # 2.53 insn per cycle - 4.746678875 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.744811 sec + 17,454,360,700 cycles # 3.039 GHz + 41,822,159,126 instructions # 2.40 insn per cycle + 5.754685240 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.291295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.968808e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968808e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.352770 sec - 11,735,836,966 cycles # 3.498 GHz - 30,158,811,446 instructions # 2.57 insn per cycle - 3.356116759 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.046627 sec + 12,493,235,601 cycles # 3.083 GHz + 30,160,547,265 instructions # 2.41 insn per cycle + 4.067076512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.762119e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.774937e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.774937e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.859880 sec - 9,680,972,149 cycles # 3.382 GHz - 18,995,343,677 instructions # 1.96 insn per cycle - 2.863281821 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.357760 sec + 9,927,136,910 cycles # 2.952 GHz + 19,096,793,241 instructions # 1.92 insn per cycle + 3.375474470 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.833992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.898752e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.898752e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.816063 sec - 9,537,877,818 cycles # 3.384 GHz - 18,442,918,066 instructions # 1.93 insn per cycle - 2.819507820 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.243150 sec + 9,616,213,299 cycles # 2.960 GHz + 18,757,748,925 instructions # 1.95 insn per cycle + 3.265371118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.539315e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.305642e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.305642e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.365877 sec - 8,037,965,784 cycles # 3.394 GHz - 15,297,089,513 instructions # 1.90 insn per cycle - 2.369192143 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.680994 sec + 8,464,459,891 cycles # 2.296 GHz + 15,603,182,673 instructions # 1.84 insn per cycle + 3.700542167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index a63eab528c..5e36a6ad1c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:22:33 +DATE: 2024-03-01_03:02:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.677531 sec + 2,738,360,567 cycles # 3.010 GHz + 4,202,554,319 instructions # 1.53 insn per cycle + 0.971727419 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.166985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.720632e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.720632e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.530132 sec - 12,355,611,055 cycles # 3.498 GHz - 32,613,744,062 instructions # 2.64 insn per cycle - 3.533621370 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.107132 sec + 12,669,493,888 cycles # 3.081 GHz + 32,513,570,576 instructions # 2.57 insn per cycle + 4.112837024 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.915578e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.095121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.095121e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.764375 sec - 9,674,764,507 cycles # 3.497 GHz - 24,473,407,704 instructions # 2.53 insn per cycle - 2.767795138 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.385880 sec + 10,259,128,837 cycles # 3.025 GHz + 24,473,597,991 instructions # 2.39 insn per cycle + 3.391687112 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.020535e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.265971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.265971e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.659102 sec - 9,008,630,968 cycles # 3.385 GHz - 16,821,530,421 instructions # 1.87 insn per cycle - 2.662558205 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.179158 sec + 9,139,183,085 cycles # 2.870 GHz + 16,922,980,195 instructions # 1.85 insn per cycle + 3.185130704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.305206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.841692e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.841692e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.493754 sec - 8,724,117,803 cycles # 3.495 GHz - 16,041,038,377 instructions # 1.84 insn per cycle - 2.497150137 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.299126 sec + 9,225,486,663 cycles # 2.804 GHz + 16,350,529,622 instructions # 1.77 insn per cycle + 3.305119215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.721147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.731450e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.731450e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.278737 sec - 7,752,022,969 cycles # 3.398 GHz - 14,276,736,178 instructions # 1.84 insn per cycle - 2.282116900 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.449960 sec + 7,914,148,444 cycles # 2.292 GHz + 14,582,993,732 instructions # 1.84 insn per cycle + 3.455623027 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 1680b26f09..640cde8efe 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:22:52 +DATE: 2024-03-01_03:02:37 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.676373 sec + 2,668,503,996 cycles # 2.929 GHz + 4,153,523,497 instructions # 1.56 insn per cycle + 0.971892133 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.961846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.105876e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.105876e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.753891 sec - 9,636,283,642 cycles # 3.496 GHz - 25,494,464,469 instructions # 2.65 insn per cycle - 2.757264718 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.188433 sec + 9,833,021,244 cycles # 3.080 GHz + 25,393,539,961 instructions # 2.58 insn per cycle + 3.194101979 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.533991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.440258e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.440258e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.386937 sec - 8,350,886,622 cycles # 3.495 GHz - 21,482,443,122 instructions # 2.57 insn per cycle - 2.390265855 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.899703 sec + 8,920,893,128 cycles # 3.072 GHz + 21,482,466,118 instructions # 2.41 insn per cycle + 2.905533602 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.414671e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.103227e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.103227e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.417911 sec - 8,459,925,330 cycles # 3.495 GHz - 15,709,703,737 instructions # 1.86 insn per cycle - 2.421288962 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.882396 sec + 8,595,793,495 cycles # 2.978 GHz + 15,810,706,009 instructions # 1.84 insn per cycle + 2.888136564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.582488e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.453415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.453415e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.347054 sec - 8,209,972,658 cycles # 3.494 GHz - 15,201,153,143 instructions # 1.85 insn per cycle - 2.350350106 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.905551 sec + 8,435,887,633 cycles # 2.898 GHz + 15,503,428,881 instructions # 1.84 insn per cycle + 2.911395780 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.888127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.125675e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.125675e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.206515 sec - 7,514,566,109 cycles # 3.402 GHz - 13,978,476,813 instructions # 1.86 insn per cycle - 2.209835018 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.208349 sec + 7,562,205,797 cycles # 2.353 GHz + 14,282,233,625 instructions # 1.89 insn per cycle + 3.214128577 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index e23690073e..4388b968c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:50 +DATE: 2024-03-01_02:25:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080730e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.584592 sec + 2,424,873,450 cycles # 2.992 GHz + 3,757,113,510 instructions # 1.55 insn per cycle + 0.891497126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.579769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893874e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.588717 sec - 16,042,603,900 cycles # 3.494 GHz - 43,689,952,449 instructions # 2.72 insn per cycle - 4.591858539 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.861200 sec + 17,835,681,737 cycles # 3.040 GHz + 43,512,863,183 instructions # 2.44 insn per cycle + 5.870178360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.164150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.789236e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.789236e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.577598 sec - 9,012,166,111 cycles # 3.494 GHz - 21,985,206,002 instructions # 2.44 insn per cycle - 2.580747570 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.010180 sec + 9,264,818,102 cycles # 3.072 GHz + 21,907,230,972 instructions # 2.36 insn per cycle + 3.030108679 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.413033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.053155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.053155e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.401302 sec - 8,147,553,270 cycles # 3.390 GHz - 15,499,004,047 instructions # 1.90 insn per cycle - 2.404474906 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.786671 sec + 8,293,439,755 cycles # 2.970 GHz + 15,591,050,714 instructions # 1.88 insn per cycle + 2.803351674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.441969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.116891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.116891e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.399786 sec - 8,145,366,561 cycles # 3.391 GHz - 15,141,676,425 instructions # 1.86 insn per cycle - 2.402973985 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.857922 sec + 8,240,284,445 cycles # 2.878 GHz + 15,434,807,288 instructions # 1.87 insn per cycle + 2.873134335 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.532625e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114438e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.114438e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.959359 sec - 6,688,058,045 cycles # 3.409 GHz - 12,568,020,375 instructions # 1.88 insn per cycle - 1.962540189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.738177 sec + 6,634,758,903 cycles # 2.418 GHz + 12,863,535,626 instructions # 1.94 insn per cycle + 2.752418443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 09e54302f1..5ebf98d844 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:29:20 +DATE: 2024-03-01_03:13:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.664885 sec + 5,743,008,286 cycles # 3.032 GHz + 10,353,112,228 instructions # 1.80 insn per cycle + 1.950710268 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.560582e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.868731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.868731e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.680582 sec - 16,368,713,672 cycles # 3.495 GHz - 43,840,413,084 instructions # 2.68 insn per cycle - 4.684353022 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.094512 sec + 18,492,834,117 cycles # 3.035 GHz + 43,665,828,462 instructions # 2.36 insn per cycle + 6.100764200 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.038598e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.508158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.508158e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.758988 sec - 9,660,848,077 cycles # 3.498 GHz - 23,320,516,277 instructions # 2.41 insn per cycle - 2.762808644 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.242674 sec + 9,984,073,322 cycles # 3.074 GHz + 23,241,211,318 instructions # 2.33 insn per cycle + 3.248988906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.263246e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739456e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.739456e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.576121 sec - 8,764,669,029 cycles # 3.398 GHz - 16,619,915,020 instructions # 1.90 insn per cycle - 2.579987789 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.031931 sec + 9,018,287,343 cycles # 2.969 GHz + 16,710,480,351 instructions # 1.85 insn per cycle + 3.038355322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.291714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.793688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.793688e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.573938 sec - 8,759,700,956 cycles # 3.399 GHz - 16,262,157,562 instructions # 1.86 insn per cycle - 2.577692084 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.003313 sec + 8,924,279,581 cycles # 2.966 GHz + 16,553,851,203 instructions # 1.85 insn per cycle + 3.009721457 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.243100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.227413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.227413e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.161827 sec - 7,403,968,795 cycles # 3.420 GHz - 13,776,638,645 instructions # 1.86 insn per cycle - 2.165624801 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.047824 sec + 7,411,564,908 cycles # 2.428 GHz + 14,070,800,087 instructions # 1.90 insn per cycle + 3.054259465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 22febd8bf2..57f3a9eb6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:35:03 +DATE: 2024-03-01_03:26:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.176348 sec + 4,160,459,328 cycles # 2.977 GHz + 6,608,736,714 instructions # 1.59 insn per cycle + 1.454481545 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.580543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894945e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.585144 sec - 16,033,044,830 cycles # 3.495 GHz - 43,690,102,128 instructions # 2.73 insn per cycle - 4.588130273 seconds time elapsed +TOTAL : 6.084905 sec + 18,848,150,042 cycles # 3.095 GHz + 43,694,410,467 instructions # 2.32 insn per cycle + 6.090122961 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.165795e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795351e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795351e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.575915 sec - 9,005,888,202 cycles # 3.493 GHz - 21,985,016,188 instructions # 2.44 insn per cycle - 2.578895980 seconds time elapsed +TOTAL : 3.340145 sec + 10,237,006,523 cycles # 3.061 GHz + 21,987,992,116 instructions # 2.15 insn per cycle + 3.345494687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.416776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.062032e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.062032e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.398556 sec - 8,138,328,552 cycles # 3.390 GHz - 15,499,199,008 instructions # 1.90 insn per cycle - 2.401542629 seconds time elapsed +TOTAL : 3.130033 sec + 9,276,164,079 cycles # 2.959 GHz + 15,501,530,354 instructions # 1.67 insn per cycle + 3.135291294 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.447614e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125962e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.395743 sec - 8,131,915,937 cycles # 3.391 GHz - 15,135,872,991 instructions # 1.86 insn per cycle - 2.398760307 seconds time elapsed +TOTAL : 3.090209 sec + 9,218,829,691 cycles # 2.980 GHz + 15,143,949,757 instructions # 1.64 insn per cycle + 3.095551418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.538033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.123539e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.123539e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.958851 sec - 6,688,513,152 cycles # 3.411 GHz - 12,568,494,145 instructions # 1.88 insn per cycle - 1.961860292 seconds time elapsed +TOTAL : 3.081111 sec + 7,633,670,846 cycles # 2.474 GHz + 12,572,894,419 instructions # 1.65 insn per cycle + 3.086406325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index fc344cccba..72f866059b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:09 +DATE: 2024-03-01_03:23:30 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.312185e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188856e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293387e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.840959 sec + 3,233,651,545 cycles # 3.031 GHz + 6,593,293,750 instructions # 2.04 insn per cycle + 1.123835132 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted - 4,566,934 cycles # 3.256 GHz - 6,299,440 instructions # 1.38 insn per cycle - 0.038188510 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.165423e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380976e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.753852 sec + 17,814,734,742 cycles # 3.094 GHz + 43,512,567,450 instructions # 2.44 insn per cycle + 5.759197636 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,511,989 cycles # 2.673 GHz - 6,385,849 instructions # 1.42 insn per cycle - 0.039615964 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.367425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644557e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.015821 sec + 9,302,641,553 cycles # 3.081 GHz + 21,907,397,717 instructions # 2.35 insn per cycle + 3.021054890 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,505,776 cycles # 2.678 GHz - 6,324,297 instructions # 1.40 insn per cycle - 0.039078577 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.605570e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.994881e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994881e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.763364 sec + 8,259,626,138 cycles # 2.984 GHz + 15,589,955,941 instructions # 1.89 insn per cycle + 2.768827600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,566,500 cycles # 3.261 GHz - 6,322,428 instructions # 1.38 insn per cycle - 0.037934624 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.581356e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971929e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.794808 sec + 8,189,932,997 cycles # 2.926 GHz + 15,434,468,382 instructions # 1.88 insn per cycle + 2.800117026 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,587,833 cycles # 3.231 GHz - 6,330,442 instructions # 1.38 insn per cycle - 0.038651242 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.644746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.098711e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.098711e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.731125 sec + 6,642,886,027 cycles # 2.429 GHz + 12,862,690,732 instructions # 1.94 insn per cycle + 2.736362886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 49d434f85c..8d8716bc9a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:32:37 +DATE: 2024-03-01_03:20:14 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.452761 sec + 5,067,036,613 cycles # 3.030 GHz + 9,262,361,364 instructions # 1.83 insn per cycle + 1.731002061 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.581672e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896680e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896680e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.582644 sec - 16,029,325,021 cycles # 3.496 GHz - 43,690,424,802 instructions # 2.73 insn per cycle - 4.585651317 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.780149 sec + 17,815,433,670 cycles # 3.080 GHz + 43,511,102,764 instructions # 2.44 insn per cycle + 5.785180938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.149026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.792646e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.792646e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.584919 sec - 9,045,554,644 cycles # 3.496 GHz - 21,985,410,240 instructions # 2.43 insn per cycle - 2.587906690 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.992624 sec + 9,227,327,267 cycles # 3.079 GHz + 21,906,426,544 instructions # 2.37 insn per cycle + 2.997895192 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.414111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.052877e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.052877e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.399307 sec - 8,139,048,725 cycles # 3.389 GHz - 15,499,397,322 instructions # 1.90 insn per cycle - 2.402310060 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.845512 sec + 8,254,984,848 cycles # 2.896 GHz + 15,590,498,904 instructions # 1.89 insn per cycle + 2.850900280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.448749e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125723e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125723e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.394319 sec - 8,124,895,108 cycles # 3.390 GHz - 15,136,296,147 instructions # 1.86 insn per cycle - 2.397299852 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.764714 sec + 8,215,374,590 cycles # 2.969 GHz + 15,429,066,515 instructions # 1.88 insn per cycle + 2.770036927 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.541933e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.127583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.127583e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.954962 sec - 6,675,259,521 cycles # 3.411 GHz - 12,568,022,737 instructions # 1.88 insn per cycle - 1.957882207 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.731162 sec + 6,615,238,340 cycles # 2.419 GHz + 12,862,797,254 instructions # 1.94 insn per cycle + 2.736410000 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 4dc37a29ce..f9e4000e6d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:08 +DATE: 2024-03-01_02:25:31 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.581297 sec + 2,416,875,461 cycles # 3.000 GHz + 3,802,904,431 instructions # 1.57 insn per cycle + 0.886522859 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.720434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100723e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100723e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.265109 sec - 14,915,458,746 cycles # 3.495 GHz - 41,447,917,911 instructions # 2.78 insn per cycle - 4.268275411 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.444566 sec + 16,726,225,777 cycles # 3.070 GHz + 41,270,625,621 instructions # 2.47 insn per cycle + 5.454849598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.260922e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.026866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.026866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.519340 sec - 8,813,391,527 cycles # 3.495 GHz - 21,289,273,651 instructions # 2.42 insn per cycle - 2.522478814 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.914617 sec + 8,996,783,237 cycles # 3.081 GHz + 21,210,998,059 instructions # 2.36 insn per cycle + 2.929493898 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.427524e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.128695e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.128695e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.394514 sec - 8,126,459,965 cycles # 3.390 GHz - 15,333,379,094 instructions # 1.89 insn per cycle - 2.397695798 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.760181 sec + 8,249,336,928 cycles # 2.983 GHz + 15,425,238,678 instructions # 1.87 insn per cycle + 2.778856529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.456500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.144948e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.144948e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.391151 sec - 8,113,737,929 cycles # 3.390 GHz - 14,939,981,190 instructions # 1.84 insn per cycle - 2.394304414 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.789811 sec + 8,096,556,575 cycles # 2.897 GHz + 15,238,891,903 instructions # 1.88 insn per cycle + 2.804859872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.545530e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.144301e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.144301e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.956043 sec - 6,680,719,629 cycles # 3.411 GHz - 12,547,036,461 instructions # 1.88 insn per cycle - 1.959245995 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.735992 sec + 6,623,617,660 cycles # 2.417 GHz + 12,843,079,376 instructions # 1.94 insn per cycle + 2.752411310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052564145764E-002 Relative difference = 1.9988585667912256e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 5416692647..fde060de72 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:23:08 +DATE: 2024-03-01_03:03:05 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181869e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576138 sec + 2,415,755,755 cycles # 3.001 GHz + 3,734,378,655 instructions # 1.55 insn per cycle + 0.864225849 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.220532e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908131e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908131e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.440500 sec - 12,037,718,902 cycles # 3.497 GHz - 32,611,773,785 instructions # 2.71 insn per cycle - 3.443737801 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.002640 sec + 12,159,409,273 cycles # 3.035 GHz + 32,432,694,101 instructions # 2.67 insn per cycle + 4.008158303 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039840314887E-002 Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.735307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.227346e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.227346e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.275375 sec - 7,963,357,113 cycles # 3.496 GHz - 18,736,043,078 instructions # 2.35 insn per cycle - 2.278622500 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.601867 sec + 7,999,882,010 cycles # 3.069 GHz + 18,656,600,340 instructions # 2.33 insn per cycle + 2.607493343 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039283704129E-002 Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.874885e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.138866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.138866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.180630 sec - 7,418,462,702 cycles # 3.398 GHz - 14,160,332,853 instructions # 1.91 insn per cycle - 2.183917049 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.492780 sec + 7,427,313,914 cycles # 2.974 GHz + 14,251,086,474 instructions # 1.92 insn per cycle + 2.498394316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.068100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.691798e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.691798e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.120331 sec - 7,409,259,501 cycles # 3.490 GHz - 13,649,785,865 instructions # 1.84 insn per cycle - 2.123584431 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.444620 sec + 7,299,238,549 cycles # 2.980 GHz + 13,947,633,533 instructions # 1.91 insn per cycle + 2.450212772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.512909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.081395e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.081395e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.966622 sec - 6,716,721,759 cycles # 3.411 GHz - 13,128,043,043 instructions # 1.95 insn per cycle - 1.969885224 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.681955 sec + 6,492,318,128 cycles # 2.417 GHz + 13,422,094,611 instructions # 2.07 insn per cycle + 2.687432186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052562326775E-002 Relative difference = 1.997440588685788e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ad73c3d757..0d6d3b3db1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:23:25 +DATE: 2024-03-01_03:03:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204111e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576922 sec + 2,404,705,116 cycles # 2.985 GHz + 3,758,296,111 instructions # 1.56 insn per cycle + 0.864210592 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.058716e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.506875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.506875e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 2.666546 sec - 9,327,231,220 cycles # 3.496 GHz - 25,447,900,546 instructions # 2.73 insn per cycle - 2.669764195 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.097656 sec + 9,472,450,742 cycles # 3.053 GHz + 25,268,175,697 instructions # 2.67 insn per cycle + 3.103042436 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039838495897E-002 Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.323765e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.058451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.058451e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.046425 sec - 7,160,329,051 cycles # 3.495 GHz - 16,947,102,157 instructions # 2.37 insn per cycle - 2.049696854 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.415041 sec + 7,164,638,851 cycles # 2.961 GHz + 16,869,197,703 instructions # 2.35 insn per cycle + 2.420723497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.297201e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.290316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.290316e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.022038 sec - 7,066,919,623 cycles # 3.491 GHz - 13,525,128,890 instructions # 1.91 insn per cycle - 2.025234558 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.394138 sec + 7,165,321,711 cycles # 2.987 GHz + 13,616,190,038 instructions # 1.90 insn per cycle + 2.399577311 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.389695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.530798e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.530798e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.007329 sec - 7,015,071,750 cycles # 3.491 GHz - 13,133,210,306 instructions # 1.87 insn per cycle - 2.010465766 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.363661 sec + 7,031,964,685 cycles # 2.970 GHz + 13,425,613,371 instructions # 1.91 insn per cycle + 2.369281481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.659878e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.534461e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.534461e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.922796 sec - 6,566,422,271 cycles # 3.411 GHz - 12,859,737,400 instructions # 1.96 insn per cycle - 1.926087575 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.592425 sec + 6,321,858,831 cycles # 2.434 GHz + 13,153,560,775 instructions # 2.08 insn per cycle + 2.597985755 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052536860923E-002 Relative difference = 1.977588895209662e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 837cf18a48..4be3e76490 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:26 +DATE: 2024-03-01_02:26:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.717219 sec + 2,841,227,385 cycles # 2.957 GHz + 4,430,504,412 instructions # 1.56 insn per cycle + 1.049815549 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.377191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.581422e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.581422e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.191610 sec - 18,159,288,198 cycles # 3.496 GHz - 44,323,801,591 instructions # 2.44 insn per cycle - 5.194897716 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.069129 sec + 18,728,354,553 cycles # 3.083 GHz + 44,224,513,518 instructions # 2.36 insn per cycle + 6.079869673 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.293424e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.965077e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.965077e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.349210 sec - 11,716,680,806 cycles # 3.496 GHz - 30,916,285,925 instructions # 2.64 insn per cycle - 3.352547582 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.001256 sec + 12,323,242,096 cycles # 3.075 GHz + 30,917,838,115 instructions # 2.51 insn per cycle + 4.017904894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.736734e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.720058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.720058e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.874813 sec - 9,725,993,159 cycles # 3.380 GHz - 19,272,349,716 instructions # 1.98 insn per cycle - 2.878123285 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.416443 sec + 10,120,877,504 cycles # 2.958 GHz + 19,374,733,180 instructions # 1.91 insn per cycle + 3.431641491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.837062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.944737e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.944737e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.808701 sec - 9,507,947,404 cycles # 3.382 GHz - 18,641,360,147 instructions # 1.96 insn per cycle - 2.812001364 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.374976 sec + 9,706,052,635 cycles # 2.871 GHz + 18,944,519,271 instructions # 1.95 insn per cycle + 3.395274500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.604258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.479437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.479437e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.334946 sec - 7,938,052,698 cycles # 3.396 GHz - 14,749,724,015 instructions # 1.86 insn per cycle - 2.338250895 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.760847 sec + 8,409,257,244 cycles # 2.233 GHz + 15,057,436,319 instructions # 1.79 insn per cycle + 3.776930410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 076a22f416..77001f8935 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:47 +DATE: 2024-03-01_02:26:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699538 sec + 2,805,342,043 cycles # 2.999 GHz + 4,414,010,673 instructions # 1.57 insn per cycle + 1.020206687 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.451232e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.679621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.679621e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.958423 sec - 17,349,182,496 cycles # 3.497 GHz - 42,572,483,064 instructions # 2.45 insn per cycle - 4.961760502 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.837265 sec + 18,090,198,997 cycles # 3.097 GHz + 42,472,863,850 instructions # 2.35 insn per cycle + 5.848007644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.362004e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.081863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.081863e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.272026 sec - 11,447,774,045 cycles # 3.496 GHz - 30,223,060,667 instructions # 2.64 insn per cycle - 3.275374923 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.920672 sec + 12,137,736,337 cycles # 3.092 GHz + 30,225,042,392 instructions # 2.49 insn per cycle + 3.938311189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.762711e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775192e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.854465 sec - 9,656,821,540 cycles # 3.380 GHz - 19,154,909,313 instructions # 1.98 insn per cycle - 2.857812496 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.437770 sec + 10,015,371,277 cycles # 2.909 GHz + 19,256,811,213 instructions # 1.92 insn per cycle + 3.454377757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.912614e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.047841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.047841e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.751170 sec - 9,315,104,434 cycles # 3.383 GHz - 18,442,592,455 instructions # 1.98 insn per cycle - 2.754516121 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.235635 sec + 9,645,810,411 cycles # 2.976 GHz + 18,756,051,671 instructions # 1.94 insn per cycle + 3.251774736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.659029e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.577201e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.577201e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.308570 sec - 7,854,338,821 cycles # 3.399 GHz - 14,673,429,400 instructions # 1.87 insn per cycle - 2.311927939 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.592139 sec + 8,293,535,644 cycles # 2.305 GHz + 14,979,176,568 instructions # 1.81 insn per cycle + 3.613399615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 38908c3fb1..9a5df19d5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:08 +DATE: 2024-03-01_02:27:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.535145 sec + 2,303,454,226 cycles # 2.990 GHz + 3,249,200,622 instructions # 1.41 insn per cycle + 0.848848936 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.782462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856967e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.897327 sec - 13,634,239,091 cycles # 3.496 GHz - 38,385,444,668 instructions # 2.82 insn per cycle - 3.900701537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.903669 sec + 15,175,795,116 cycles # 3.093 GHz + 38,374,949,840 instructions # 2.53 insn per cycle + 4.917105673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.802582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.042466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.042466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.309068 sec - 8,081,737,607 cycles # 3.496 GHz - 24,573,373,219 instructions # 3.04 insn per cycle - 2.312471054 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.968890 sec + 9,101,848,873 cycles # 3.060 GHz + 24,578,505,710 instructions # 2.70 insn per cycle + 2.986159008 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.804161e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.442356e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.442356e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.464920 sec - 4,882,777,358 cycles # 3.328 GHz - 11,230,055,186 instructions # 2.30 insn per cycle - 1.468368837 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.936093 sec + 5,474,671,571 cycles # 2.819 GHz + 11,252,385,098 instructions # 2.06 insn per cycle + 1.954008279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.365690e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.101366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.101366e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.377099 sec - 4,595,020,426 cycles # 3.330 GHz - 10,504,303,399 instructions # 2.29 insn per cycle - 1.380458418 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.774092 sec + 4,972,729,611 cycles # 2.794 GHz + 10,557,445,760 instructions # 2.12 insn per cycle + 1.789622209 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.158651e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.839395e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.839395e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.408011 sec - 4,692,884,605 cycles # 3.327 GHz - 7,735,219,640 instructions # 1.65 insn per cycle - 1.411361186 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.799185 sec + 5,395,066,029 cycles # 1.924 GHz + 7,793,871,634 instructions # 1.44 insn per cycle + 2.817161041 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 3f9ce1ce83..598396a8e7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:29:40 +DATE: 2024-03-01_03:14:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.801549 sec + 3,157,604,220 cycles # 3.025 GHz + 4,827,294,021 instructions # 1.53 insn per cycle + 1.101037847 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.768798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.843131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.843131e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.955815 sec - 13,832,169,175 cycles # 3.494 GHz - 38,450,905,414 instructions # 2.78 insn per cycle - 3.959789848 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.008942 sec + 15,497,351,856 cycles # 3.090 GHz + 38,433,512,801 instructions # 2.48 insn per cycle + 5.015755142 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.744115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.977846e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.977846e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.391627 sec - 8,365,963,654 cycles # 3.493 GHz - 24,752,915,009 instructions # 2.96 insn per cycle - 2.395600644 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090616 sec + 9,430,020,802 cycles # 3.049 GHz + 24,763,068,407 instructions # 2.63 insn per cycle + 3.097621879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.710929e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.332597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.332597e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.535175 sec - 5,133,184,909 cycles # 3.337 GHz - 11,512,737,529 instructions # 2.24 insn per cycle - 1.539209812 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.984017 sec + 5,826,620,771 cycles # 2.928 GHz + 11,538,062,844 instructions # 1.98 insn per cycle + 1.990946794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.182780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.885497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.885497e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.458979 sec - 4,882,301,300 cycles # 3.339 GHz - 10,787,066,129 instructions # 2.21 insn per cycle - 1.463011407 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.799262 sec + 5,294,562,816 cycles # 2.933 GHz + 10,843,404,980 instructions # 2.05 insn per cycle + 1.806082483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.015903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.677662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.677662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.490826 sec - 4,984,106,886 cycles # 3.336 GHz - 7,975,592,103 instructions # 1.60 insn per cycle - 1.494900312 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.778138 sec + 5,743,518,580 cycles # 2.063 GHz + 8,037,207,687 instructions # 1.40 insn per cycle + 2.785184310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index bbbf7c9fcf..977053e874 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:35:21 +DATE: 2024-03-01_03:27:17 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 0.617245 sec + 2,532,813,012 cycles # 2.999 GHz + 3,701,870,616 instructions # 1.46 insn per cycle + 0.904006340 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.781344e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856090e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.899236 sec - 13,635,452,338 cycles # 3.495 GHz - 38,385,453,862 instructions # 2.82 insn per cycle - 3.902350302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.966854 sec + 15,343,121,883 cycles # 3.087 GHz + 38,390,661,623 instructions # 2.50 insn per cycle + 4.972403311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.795094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.033476e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.033476e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.312363 sec - 8,089,982,420 cycles # 3.495 GHz - 24,573,716,651 instructions # 3.04 insn per cycle - 2.315558026 seconds time elapsed +TOTAL : 3.079495 sec + 9,279,730,828 cycles # 3.010 GHz + 24,577,932,954 instructions # 2.65 insn per cycle + 3.085060857 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.852044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.495498e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.495498e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.456479 sec - 4,857,277,999 cycles # 3.329 GHz - 11,230,242,008 instructions # 2.31 insn per cycle - 1.459674299 seconds time elapsed +TOTAL : 1.937503 sec + 5,654,473,993 cycles # 2.911 GHz + 11,233,989,199 instructions # 1.99 insn per cycle + 1.943141738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.332356e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.056692e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.056692e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.381870 sec - 4,611,557,382 cycles # 3.331 GHz - 10,504,643,617 instructions # 2.28 insn per cycle - 1.385061393 seconds time elapsed +TOTAL : 1.757396 sec + 5,128,637,723 cycles # 2.910 GHz + 10,505,547,256 instructions # 2.05 insn per cycle + 1.762900213 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.304542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.010565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.010565e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.385516 sec - 4,620,699,813 cycles # 3.329 GHz - 7,735,038,876 instructions # 1.67 insn per cycle - 1.388698104 seconds time elapsed +TOTAL : 2.739915 sec + 5,558,468,681 cycles # 2.025 GHz + 7,741,606,815 instructions # 1.39 insn per cycle + 2.745378653 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index f421a264ca..29a670398e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:34:14 +DATE: 2024-03-01_03:24:00 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.579097e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155655e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270242e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.554530 sec + 2,358,271,315 cycles # 3.013 GHz + 3,682,090,929 instructions # 1.56 insn per cycle + 0.840283729 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted - 4,520,752 cycles # 3.261 GHz - 6,298,367 instructions # 1.39 insn per cycle - 0.037814907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.177843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241689e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.919493 sec + 15,156,700,875 cycles # 3.078 GHz + 38,373,397,442 instructions # 2.53 insn per cycle + 4.925048190 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,515,960 cycles # 3.261 GHz - 6,261,612 instructions # 1.39 insn per cycle - 0.038098825 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.588081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.785746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.785746e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.028765 sec + 9,114,596,397 cycles # 3.011 GHz + 24,581,732,536 instructions # 2.70 insn per cycle + 3.034354491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,384,140 cycles # 2.671 GHz - 6,269,302 instructions # 1.43 insn per cycle - 0.039362811 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.476997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.476997e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.869884 sec + 5,467,539,853 cycles # 2.917 GHz + 11,251,237,475 instructions # 2.06 insn per cycle + 1.875504692 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,828,863 cycles # 3.264 GHz - 6,315,171 instructions # 1.31 insn per cycle - 0.038710635 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.273575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.896545e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.896545e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.780053 sec + 4,944,261,583 cycles # 2.770 GHz + 10,558,833,446 instructions # 2.14 insn per cycle + 1.785881884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,574,413 cycles # 2.717 GHz - 6,280,420 instructions # 1.37 insn per cycle - 0.038677314 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.090701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.328087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.328087e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.667720 sec + 5,371,754,599 cycles # 2.010 GHz + 7,792,372,952 instructions # 1.45 insn per cycle + 2.673339648 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 0b4ae759d7..e5cfc13b3e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:32:56 +DATE: 2024-03-01_03:20:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.697938 sec + 2,798,675,219 cycles # 3.021 GHz + 4,376,672,842 instructions # 1.56 insn per cycle + 0.983897382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.782456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.857581e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.857581e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.898425 sec - 13,637,022,234 cycles # 3.496 GHz - 38,385,249,434 instructions # 2.81 insn per cycle - 3.901702317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.893907 sec + 15,162,024,600 cycles # 3.096 GHz + 38,372,989,497 instructions # 2.53 insn per cycle + 4.899450957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.798533e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038408e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038408e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.311197 sec - 8,087,896,887 cycles # 3.496 GHz - 24,573,667,331 instructions # 3.04 insn per cycle - 2.314383849 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.935182 sec + 9,091,941,153 cycles # 3.094 GHz + 24,577,519,112 instructions # 2.70 insn per cycle + 2.940777194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.865253e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.517718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.517718e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.455280 sec - 4,850,997,989 cycles # 3.328 GHz - 11,230,074,042 instructions # 2.32 insn per cycle - 1.458527044 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.869802 sec + 5,458,289,042 cycles # 2.911 GHz + 11,250,961,339 instructions # 2.06 insn per cycle + 1.875881825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.377231e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.116751e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.116751e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.375896 sec - 4,589,261,968 cycles # 3.330 GHz - 10,502,721,091 instructions # 2.29 insn per cycle - 1.379048846 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.719311 sec + 5,034,836,824 cycles # 2.920 GHz + 10,558,271,294 instructions # 2.10 insn per cycle + 1.725057980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.291081e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.994820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.994820e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.387507 sec - 4,623,729,787 cycles # 3.327 GHz - 7,735,191,540 instructions # 1.67 insn per cycle - 1.390703747 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.716839 sec + 5,403,556,568 cycles # 1.987 GHz + 7,794,191,095 instructions # 1.44 insn per cycle + 2.722528243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 3c03138a4d..73356b00dd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:23 +DATE: 2024-03-01_02:27:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.538743 sec + 2,297,794,086 cycles # 2.963 GHz + 3,276,125,304 instructions # 1.43 insn per cycle + 0.856267333 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.710529e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781570e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.996408 sec - 13,979,413,584 cycles # 3.496 GHz - 40,196,760,217 instructions # 2.88 insn per cycle - 3.999779379 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.877526 sec + 15,081,677,651 cycles # 3.089 GHz + 40,100,660,385 instructions # 2.66 insn per cycle + 4.889980594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.077291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.346738e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.346738e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.191848 sec - 7,669,887,605 cycles # 3.495 GHz - 23,666,455,211 instructions # 3.09 insn per cycle - 2.195224486 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.787478 sec + 8,606,981,244 cycles # 3.082 GHz + 23,670,854,000 instructions # 2.75 insn per cycle + 2.801213189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.056560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.577761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.577761e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.608065 sec - 5,356,503,720 cycles # 3.326 GHz - 13,039,165,916 instructions # 2.43 insn per cycle - 1.611516596 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.088271 sec + 6,101,163,180 cycles # 2.915 GHz + 13,060,965,379 instructions # 2.14 insn per cycle + 2.110411764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.286280e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.839274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.839274e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.563652 sec - 5,212,654,568 cycles # 3.328 GHz - 12,266,825,956 instructions # 2.35 insn per cycle - 1.567147417 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.007458 sec + 5,795,313,103 cycles # 2.878 GHz + 12,320,114,352 instructions # 2.13 insn per cycle + 2.035740422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.906483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.553389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.553389e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.449150 sec - 4,824,305,914 cycles # 3.323 GHz - 9,542,657,586 instructions # 1.98 insn per cycle - 1.452571538 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.054998 sec + 5,836,990,709 cycles # 1.908 GHz + 9,601,704,067 instructions # 1.64 insn per cycle + 3.069883688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 25ef1d5f3d..7ca7ca6f27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:23:40 +DATE: 2024-03-01_03:03:58 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.520509 sec + 2,251,864,611 cycles # 2.979 GHz + 3,200,076,053 instructions # 1.42 insn per cycle + 0.813049887 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.192021e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.290805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.290805e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.415579 sec - 11,947,926,572 cycles # 3.495 GHz - 34,397,405,204 instructions # 2.88 insn per cycle - 3.419004365 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.235724 sec + 13,018,811,907 cycles # 3.070 GHz + 34,384,492,801 instructions # 2.64 insn per cycle + 4.241723051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.733586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.879812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.879812e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.934771 sec - 10,271,141,406 cycles # 3.497 GHz - 24,003,451,169 instructions # 2.34 insn per cycle - 2.938285168 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.527791 sec + 10,618,068,276 cycles # 3.005 GHz + 24,006,297,751 instructions # 2.26 insn per cycle + 3.533644608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.994183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.364715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.364715e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.870369 sec - 6,224,279,915 cycles # 3.323 GHz - 12,380,406,144 instructions # 1.99 insn per cycle - 1.873883038 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.268558 sec + 6,594,099,256 cycles # 2.900 GHz + 12,400,446,525 instructions # 1.88 insn per cycle + 2.274329127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.449558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.878567e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.878567e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.751343 sec - 5,830,240,093 cycles # 3.324 GHz - 11,520,328,697 instructions # 1.98 insn per cycle - 1.754890337 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.142175 sec + 6,250,159,272 cycles # 2.911 GHz + 11,574,474,977 instructions # 1.85 insn per cycle + 2.148019416 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.956146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.616868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.616868e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.441128 sec - 4,799,320,936 cycles # 3.324 GHz - 9,238,083,496 instructions # 1.92 insn per cycle - 1.444627570 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.637824 sec + 5,343,225,675 cycles # 2.022 GHz + 9,294,792,947 instructions # 1.74 insn per cycle + 2.643638198 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 94cf15e1a9..6740b658ab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:23:56 +DATE: 2024-03-01_03:04:25 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.525125 sec + 2,266,508,632 cycles # 2.999 GHz + 3,227,683,893 instructions # 1.42 insn per cycle + 0.815560561 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.402676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515698e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.211500 sec - 11,235,460,343 cycles # 3.496 GHz - 35,050,056,722 instructions # 3.12 insn per cycle - 3.214868359 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.008193 sec + 12,350,315,150 cycles # 3.077 GHz + 35,037,181,267 instructions # 2.84 insn per cycle + 4.014100641 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.645770e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.783620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.783620e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.003938 sec - 10,512,340,009 cycles # 3.496 GHz - 23,080,735,378 instructions # 2.20 insn per cycle - 3.007803609 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.458899 sec + 10,688,048,117 cycles # 3.085 GHz + 23,082,662,787 instructions # 2.16 insn per cycle + 3.464737128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.439627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.869125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.869125e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.750278 sec - 5,827,983,221 cycles # 3.325 GHz - 11,935,795,493 instructions # 2.05 insn per cycle - 1.753728973 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.175532 sec + 6,167,789,524 cycles # 2.829 GHz + 11,956,365,830 instructions # 1.94 insn per cycle + 2.181490352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.801848e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.279461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.279461e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.666561 sec - 5,551,706,657 cycles # 3.326 GHz - 11,073,468,618 instructions # 1.99 insn per cycle - 1.670062494 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.062589 sec + 6,012,687,929 cycles # 2.908 GHz + 11,129,506,913 instructions # 1.85 insn per cycle + 2.068524285 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.282360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.994130e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.994130e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.389411 sec - 4,628,373,888 cycles # 3.325 GHz - 8,962,575,488 instructions # 1.94 insn per cycle - 1.392904800 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.581777 sec + 5,215,223,845 cycles # 2.016 GHz + 9,019,923,506 instructions # 1.73 insn per cycle + 2.587755549 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index a7acc63d87..3164378b7a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:38 +DATE: 2024-03-01_02:28:04 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.485254 sec + 2,068,141,298 cycles # 2.904 GHz + 2,916,142,359 instructions # 1.41 insn per cycle + 0.784434250 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.061036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.162907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.162907e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.542085 sec - 12,391,424,054 cycles # 3.496 GHz - 38,265,921,403 instructions # 3.09 insn per cycle - 3.545513591 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.621612 sec + 14,026,409,554 cycles # 3.032 GHz + 38,341,238,705 instructions # 2.73 insn per cycle + 4.632085783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.565544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.063647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.063647e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.710778 sec - 5,991,801,017 cycles # 3.497 GHz - 15,824,708,814 instructions # 2.64 insn per cycle - 1.714001105 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.094155 sec + 6,477,656,873 cycles # 3.085 GHz + 15,815,714,256 instructions # 2.44 insn per cycle + 2.109661469 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.241504e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.408345e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.408345e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.952958 sec - 3,189,843,324 cycles # 3.339 GHz - 7,575,039,950 instructions # 2.37 insn per cycle - 0.956152471 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.180439 sec + 3,464,791,228 cycles # 2.924 GHz + 7,594,553,534 instructions # 2.19 insn per cycle + 1.196926932 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.319510e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510809e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510809e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.906095 sec - 3,034,230,865 cycles # 3.340 GHz - 7,150,811,632 instructions # 2.36 insn per cycle - 0.909351723 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.103361 sec + 3,253,544,502 cycles # 2.935 GHz + 7,202,500,133 instructions # 2.21 insn per cycle + 1.115792553 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.446120e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.684000e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.684000e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.836297 sec - 2,802,120,149 cycles # 3.341 GHz - 5,780,771,468 instructions # 2.06 insn per cycle - 0.839556975 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.467307 sec + 3,062,229,633 cycles # 2.079 GHz + 5,834,823,887 instructions # 1.91 insn per cycle + 1.480044473 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 244f7ed452..b32abcb3fe 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:29:55 +DATE: 2024-03-01_03:14:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.665285 sec + 2,679,931,908 cycles # 3.001 GHz + 4,173,181,221 instructions # 1.56 insn per cycle + 0.950193790 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.053680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.154942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.154942e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.570943 sec - 12,491,119,184 cycles # 3.495 GHz - 38,309,264,755 instructions # 3.07 insn per cycle - 3.574697502 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.608146 sec + 14,198,803,048 cycles # 3.078 GHz + 38,383,841,480 instructions # 2.70 insn per cycle + 4.614561058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.494756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.984731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.984731e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.760397 sec - 6,151,072,141 cycles # 3.488 GHz - 16,106,098,708 instructions # 2.62 insn per cycle - 1.764265698 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.164951 sec + 6,682,648,138 cycles # 3.079 GHz + 16,095,511,662 instructions # 2.41 insn per cycle + 2.171478460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.220386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.382151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.382151e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.999164 sec - 3,353,390,197 cycles # 3.346 GHz - 7,813,166,304 instructions # 2.33 insn per cycle - 1.003041613 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.245724 sec + 3,655,872,382 cycles # 2.921 GHz + 7,830,960,228 instructions # 2.14 insn per cycle + 1.252058919 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.301043e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.484531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.484531e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.948601 sec - 3,185,245,475 cycles # 3.348 GHz - 7,388,936,707 instructions # 2.32 insn per cycle - 0.952384757 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.193275 sec + 3,439,455,837 cycles # 2.869 GHz + 7,440,735,686 instructions # 2.16 insn per cycle + 1.199824293 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.412343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637273e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637273e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.886349 sec - 2,975,186,907 cycles # 3.345 GHz - 6,036,018,349 instructions # 2.03 insn per cycle - 0.890113209 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.539244 sec + 3,276,504,779 cycles # 2.121 GHz + 6,089,433,455 instructions # 1.86 insn per cycle + 1.545785864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index d4cdb5a44f..1418229a2f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:35:36 +DATE: 2024-03-01_03:27:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962164e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.558880 sec + 2,364,095,478 cycles # 3.003 GHz + 3,484,344,192 instructions # 1.47 insn per cycle + 0.845198156 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.058975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.161124e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.544825 sec - 12,394,762,036 cycles # 3.494 GHz - 38,265,645,755 instructions # 3.09 insn per cycle - 3.547916257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.585598 sec + 14,172,267,813 cycles # 3.088 GHz + 38,370,669,897 instructions # 2.71 insn per cycle + 4.590984697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.488571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.974371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.974371e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.729685 sec - 6,044,983,083 cycles # 3.490 GHz - 15,825,361,332 instructions # 2.62 insn per cycle - 1.732686150 seconds time elapsed +TOTAL : 2.148796 sec + 6,634,619,629 cycles # 3.081 GHz + 15,827,825,218 instructions # 2.39 insn per cycle + 2.154083020 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.240146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410041e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410041e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.954496 sec - 3,193,272,434 cycles # 3.337 GHz - 7,575,303,386 instructions # 2.37 insn per cycle - 0.957550268 seconds time elapsed +TOTAL : 1.236002 sec + 3,624,228,310 cycles # 2.921 GHz + 7,577,923,207 instructions # 2.09 insn per cycle + 1.241371528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.319772e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511858e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511858e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.905312 sec - 3,033,184,068 cycles # 3.342 GHz - 7,150,791,405 instructions # 2.36 insn per cycle - 0.908396432 seconds time elapsed +TOTAL : 1.166800 sec + 3,412,475,771 cycles # 2.913 GHz + 7,154,107,852 instructions # 2.10 insn per cycle + 1.172143118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.446800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685435e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.835810 sec - 2,801,569,225 cycles # 3.343 GHz - 5,780,758,142 instructions # 2.06 insn per cycle - 0.838834457 seconds time elapsed +TOTAL : 1.519807 sec + 3,228,336,001 cycles # 2.118 GHz + 5,784,936,071 instructions # 1.79 insn per cycle + 1.525231071 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 54a85a955c..6cc1ea482a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:34:19 +DATE: 2024-03-01_03:24:27 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.444388e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637591e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958095e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.506234 sec + 2,151,061,698 cycles # 2.964 GHz + 3,317,932,316 instructions # 1.54 insn per cycle + 0.783859096 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted - 4,538,068 cycles # 3.245 GHz - 6,262,611 instructions # 1.38 insn per cycle - 0.037803610 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.348187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425786e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.550945 sec + 14,020,959,724 cycles # 3.078 GHz + 38,340,893,799 instructions # 2.73 insn per cycle + 4.556370309 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,534,363 cycles # 3.253 GHz - 6,314,690 instructions # 1.39 insn per cycle - 0.037873298 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.084306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.497288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.497288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.146528 sec + 6,470,246,026 cycles # 3.008 GHz + 15,815,477,798 instructions # 2.44 insn per cycle + 2.151761392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,637,775 cycles # 3.255 GHz - 6,328,952 instructions # 1.36 insn per cycle - 0.038380049 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.654131e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108425e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.168173 sec + 3,446,745,579 cycles # 2.939 GHz + 7,593,552,481 instructions # 2.20 insn per cycle + 1.173417445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,573,961 cycles # 3.257 GHz - 6,321,016 instructions # 1.38 insn per cycle - 0.037677949 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.035097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201064e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.096134 sec + 3,246,063,667 cycles # 2.949 GHz + 7,201,559,823 instructions # 2.22 insn per cycle + 1.101526557 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,502,378 cycles # 3.260 GHz - 6,322,124 instructions # 1.40 insn per cycle - 0.037792372 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.601752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.455480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.455480e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.463312 sec + 3,061,733,109 cycles # 2.086 GHz + 5,833,735,363 instructions # 1.91 insn per cycle + 1.468683964 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index c1a03aebbd..d1c301e36a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:33:11 +DATE: 2024-03-01_03:21:13 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.625420 sec + 2,414,961,393 cycles # 2.854 GHz + 3,791,061,685 instructions # 1.57 insn per cycle + 0.904442863 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.061154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.162932e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.162932e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.542051 sec - 12,388,794,866 cycles # 3.495 GHz - 38,264,992,582 instructions # 3.09 insn per cycle - 3.545107831 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.586154 sec + 14,183,213,679 cycles # 3.090 GHz + 38,341,040,102 instructions # 2.70 insn per cycle + 4.591510537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.542448e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.037442e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.037442e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.716038 sec - 6,003,936,148 cycles # 3.494 GHz - 15,824,627,159 instructions # 2.64 insn per cycle - 1.719029866 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.084805 sec + 6,467,654,599 cycles # 3.095 GHz + 15,814,952,627 instructions # 2.45 insn per cycle + 2.090234852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.242773e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410199e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410199e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.952242 sec - 3,187,363,454 cycles # 3.339 GHz - 7,575,422,549 instructions # 2.38 insn per cycle - 0.955261247 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.181028 sec + 3,453,301,700 cycles # 2.913 GHz + 7,593,575,205 instructions # 2.20 insn per cycle + 1.186225517 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.320310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510744e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510744e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.905497 sec - 3,033,662,646 cycles # 3.342 GHz - 7,150,728,059 instructions # 2.36 insn per cycle - 0.908547463 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.108864 sec + 3,247,038,827 cycles # 2.916 GHz + 7,202,168,264 instructions # 2.22 insn per cycle + 1.114391762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.448049e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685618e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685618e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.835256 sec - 2,796,643,852 cycles # 3.341 GHz - 5,780,733,558 instructions # 2.07 insn per cycle - 0.838315041 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.464294 sec + 3,059,603,183 cycles # 2.083 GHz + 5,833,854,527 instructions # 1.91 insn per cycle + 1.469681735 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 8b91db0e17..adc2ed2114 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:51 +DATE: 2024-03-01_02:28:27 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.480923 sec + 2,116,431,851 cycles # 3.003 GHz + 3,022,655,895 instructions # 1.43 insn per cycle + 0.777218279 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.113653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.221073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.221073e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.483745 sec - 12,191,233,427 cycles # 3.497 GHz - 39,807,639,093 instructions # 3.27 insn per cycle - 3.486969433 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.644587 sec + 14,360,257,758 cycles # 3.089 GHz + 39,833,716,550 instructions # 2.77 insn per cycle + 4.652300252 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199028000236 Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.134346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.923260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.923260e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.402390 sec - 4,912,248,574 cycles # 3.497 GHz - 15,294,202,423 instructions # 3.11 insn per cycle - 1.405648629 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.888755 sec + 5,601,188,109 cycles # 2.957 GHz + 15,285,931,975 instructions # 2.73 insn per cycle + 1.901754882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.083568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.948403e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.948403e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.262664 sec - 4,211,226,733 cycles # 3.329 GHz - 9,715,508,285 instructions # 2.31 insn per cycle - 1.265920918 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.623137 sec + 4,755,173,593 cycles # 2.919 GHz + 9,735,141,159 instructions # 2.05 insn per cycle + 1.639641207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.245949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013771e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013771e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.245932 sec - 4,154,573,718 cycles # 3.329 GHz - 9,273,579,239 instructions # 2.23 insn per cycle - 1.249125651 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.586631 sec + 4,632,931,570 cycles # 2.912 GHz + 9,326,747,974 instructions # 2.01 insn per cycle + 1.599475417 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.182229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.339411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.339411e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.997929 sec - 3,329,282,294 cycles # 3.328 GHz - 6,979,885,802 instructions # 2.10 insn per cycle - 1.001225403 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.762945 sec + 3,668,593,409 cycles # 2.074 GHz + 7,034,535,336 instructions # 1.92 insn per cycle + 1.779301540 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183459779248 Relative difference = 1.7053177021099307e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 7b61726f5d..82aee2242c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:24:11 +DATE: 2024-03-01_03:04:53 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478757 sec + 2,104,839,063 cycles # 2.996 GHz + 2,995,662,279 instructions # 1.42 insn per cycle + 0.760483148 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.274947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391721e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391721e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.318252 sec - 11,611,007,117 cycles # 3.497 GHz - 34,397,967,949 instructions # 2.96 insn per cycle - 3.321458141 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.311067 sec + 12,598,770,011 cycles # 2.919 GHz + 34,372,549,657 instructions # 2.73 insn per cycle + 4.316594695 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.009263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.579301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.579301e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.609519 sec - 5,634,626,051 cycles # 3.496 GHz - 14,869,650,453 instructions # 2.64 insn per cycle - 1.612769577 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.978899 sec + 6,105,197,866 cycles # 3.078 GHz + 14,859,942,037 instructions # 2.43 insn per cycle + 1.984598314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193803280592 Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.363912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029612e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.228350 sec - 4,099,462,044 cycles # 3.331 GHz - 9,010,087,189 instructions # 2.20 insn per cycle - 1.231663795 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.494763 sec + 4,316,279,907 cycles # 2.878 GHz + 9,028,948,283 instructions # 2.09 insn per cycle + 1.500523975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.733550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.074728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.074728e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.188251 sec - 3,967,821,126 cycles # 3.332 GHz - 8,612,196,683 instructions # 2.17 insn per cycle - 1.191524344 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.509333 sec + 4,207,142,397 cycles # 2.778 GHz + 8,663,183,236 instructions # 2.06 insn per cycle + 1.515104262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.076849e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204629e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204629e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.084935 sec - 3,621,365,276 cycles # 3.330 GHz - 7,753,075,474 instructions # 2.14 insn per cycle - 1.088240742 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.886655 sec + 3,832,564,290 cycles # 2.026 GHz + 7,807,000,610 instructions # 2.04 insn per cycle + 1.892395760 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183246739209 Relative difference = 1.6003107281264138e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 61d1db8a51..dda1db1b3c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:24:24 +DATE: 2024-03-01_03:05:16 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478497 sec + 2,092,584,267 cycles # 2.987 GHz + 2,982,481,806 instructions # 1.43 insn per cycle + 0.759974164 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.548926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.686522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.686522e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.071996 sec - 10,748,097,201 cycles # 3.497 GHz - 35,134,305,151 instructions # 3.27 insn per cycle - 3.075247886 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 3.962914 sec + 11,745,545,496 cycles # 2.960 GHz + 35,108,793,810 instructions # 2.99 insn per cycle + 3.968579892 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.177278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.785505e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.785505e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.574710 sec - 5,513,686,982 cycles # 3.496 GHz - 14,479,421,661 instructions # 2.63 insn per cycle - 1.578018917 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.925244 sec + 5,962,598,726 cycles # 3.089 GHz + 14,469,931,867 instructions # 2.43 insn per cycle + 1.931094914 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193583255634 Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.002551e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111042e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.154884 sec - 3,857,962,551 cycles # 3.333 GHz - 8,855,937,803 instructions # 2.30 insn per cycle - 1.158168839 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.475701 sec + 4,155,772,808 cycles # 2.809 GHz + 8,874,967,057 instructions # 2.14 insn per cycle + 1.481449825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.549625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050633e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050633e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.209387 sec - 4,037,720,483 cycles # 3.332 GHz - 8,359,219,935 instructions # 2.07 insn per cycle - 1.212702560 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.405788 sec + 4,123,527,517 cycles # 2.923 GHz + 8,411,119,259 instructions # 2.04 insn per cycle + 1.411551419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.088722e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.221112e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221112e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.073794 sec - 3,584,683,392 cycles # 3.331 GHz - 7,646,262,060 instructions # 2.13 insn per cycle - 1.077090879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.851731 sec + 3,787,634,254 cycles # 2.040 GHz + 7,699,934,932 instructions # 2.03 insn per cycle + 1.857323010 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183204829693 Relative difference = 1.5796536184903122e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 4c26b84629..9748a5aab4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:11:04 +DATE: 2024-03-01_02:28:51 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.526886 sec + 2,307,341,508 cycles # 3.024 GHz + 3,271,429,537 instructions # 1.42 insn per cycle + 0.836809323 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.726645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.798326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.798326e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.974088 sec - 13,901,240,398 cycles # 3.496 GHz - 38,519,700,643 instructions # 2.77 insn per cycle - 3.977444803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 678) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.926720 sec + 15,303,062,403 cycles # 3.103 GHz + 38,574,821,235 instructions # 2.52 insn per cycle + 4.935986004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.888376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137236e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.271352 sec - 7,950,613,495 cycles # 3.496 GHz - 24,219,096,177 instructions # 3.05 insn per cycle - 2.274776261 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.903163 sec + 8,984,859,488 cycles # 3.089 GHz + 24,224,163,348 instructions # 2.70 insn per cycle + 2.918366508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.937592e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.600519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.600519e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.442766 sec - 4,812,419,685 cycles # 3.329 GHz - 11,255,351,127 instructions # 2.34 insn per cycle - 1.446223272 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.860423 sec + 5,396,289,064 cycles # 2.891 GHz + 11,276,510,611 instructions # 2.09 insn per cycle + 1.875091896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.502777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260895e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260895e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.356803 sec - 4,529,795,443 cycles # 3.332 GHz - 10,470,222,537 instructions # 2.31 insn per cycle - 1.360332208 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.648151 sec + 4,836,682,110 cycles # 2.924 GHz + 10,524,586,299 instructions # 2.18 insn per cycle + 1.662467551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.442635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.183690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.183690e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.366107 sec - 4,556,171,739 cycles # 3.329 GHz - 7,545,350,102 instructions # 1.66 insn per cycle - 1.369560048 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.587933 sec + 5,228,382,592 cycles # 2.016 GHz + 7,603,380,674 instructions # 1.45 insn per cycle + 2.604403134 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index ff51ce9edf..4c3bdeb3a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:11:19 +DATE: 2024-03-01_02:29:18 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.529654 sec + 2,293,467,091 cycles # 2.992 GHz + 3,241,408,242 instructions # 1.41 insn per cycle + 0.836485234 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.696678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.767700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.767700e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.016604 sec - 14,052,871,002 cycles # 3.497 GHz - 40,347,826,232 instructions # 2.87 insn per cycle - 4.019950872 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.994421 sec + 15,338,753,655 cycles # 3.068 GHz + 40,369,233,372 instructions # 2.63 insn per cycle + 5.002383718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.079311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.348964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.348964e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.189998 sec - 7,660,292,335 cycles # 3.494 GHz - 23,249,177,880 instructions # 3.04 insn per cycle - 2.193387914 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.723159 sec + 8,478,435,163 cycles # 3.107 GHz + 23,253,497,249 instructions # 2.74 insn per cycle + 2.738604338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.802190e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.284263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.284263e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.662935 sec - 5,537,885,797 cycles # 3.326 GHz - 12,941,082,077 instructions # 2.34 insn per cycle - 1.666384267 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.127824 sec + 6,241,547,842 cycles # 2.925 GHz + 12,962,413,577 instructions # 2.08 insn per cycle + 2.144515260 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.133486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.661662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.661662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.593841 sec - 5,310,193,730 cycles # 3.327 GHz - 12,185,581,580 instructions # 2.29 insn per cycle - 1.597248619 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.074458 sec + 5,923,278,346 cycles # 2.853 GHz + 12,242,730,346 instructions # 2.07 insn per cycle + 2.086429072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.179196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.881876e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.881876e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.404473 sec - 4,673,986,503 cycles # 3.322 GHz - 8,684,795,820 instructions # 1.86 insn per cycle - 1.407876338 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.794263 sec + 5,618,790,292 cycles # 2.007 GHz + 8,743,459,975 instructions # 1.56 insn per cycle + 2.808786612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d6ad8dae6d..c4c4bff630 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:11:34 +DATE: 2024-03-01_02:29:46 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463329 sec + 2,069,832,304 cycles # 3.002 GHz + 2,918,096,235 instructions # 1.41 insn per cycle + 0.772559551 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608947 sec + 2,562,374,732 cycles # 3.012 GHz + 3,879,371,783 instructions # 1.51 insn per cycle + 0.910123971 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.210168e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.225481e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.225481e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.129663 sec - 17,942,023,952 cycles # 3.497 GHz - 59,485,012,848 instructions # 3.32 insn per cycle - 5.132002080 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.359535 sec + 19,687,428,773 cycles # 3.094 GHz + 59,604,296,849 instructions # 3.03 insn per cycle + 6.365859123 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.227836e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.283842e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.283842e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.655716 sec - 9,284,099,234 cycles # 3.494 GHz - 30,662,650,809 instructions # 3.30 insn per cycle - 2.658111464 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.515479 sec + 10,373,655,779 cycles # 2.948 GHz + 30,676,465,519 instructions # 2.96 insn per cycle + 3.528584808 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.296898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319348e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.287545 sec - 4,259,821,564 cycles # 3.305 GHz - 11,003,859,405 instructions # 2.58 insn per cycle - 1.289855021 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.702212 sec + 4,885,421,396 cycles # 2.863 GHz + 11,020,224,832 instructions # 2.26 insn per cycle + 1.717667988 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.392823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418968e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.201286 sec - 3,975,596,259 cycles # 3.305 GHz - 10,273,857,845 instructions # 2.58 insn per cycle - 1.203631834 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.517268 sec + 4,368,757,303 cycles # 2.872 GHz + 10,296,904,442 instructions # 2.36 insn per cycle + 1.532957385 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.911241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.961026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.961026e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.882693 sec - 2,920,749,975 cycles # 3.303 GHz - 5,817,985,024 instructions # 1.99 insn per cycle - 0.885121704 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.135983 sec + 4,101,318,849 cycles # 1.917 GHz + 5,843,401,136 instructions # 1.42 insn per cycle + 2.151041040 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 105cbe8fdc..7a80a6327c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:30:08 +DATE: 2024-03-01_03:14:59 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.494713 sec + 2,059,588,733 cycles # 2.926 GHz + 3,067,379,574 instructions # 1.49 insn per cycle + 0.764554853 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.824199 sec + 3,179,114,916 cycles # 2.965 GHz + 5,069,610,946 instructions # 1.59 insn per cycle + 1.133521853 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.215926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231294e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231294e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.124663 sec - 17,917,725,694 cycles # 3.495 GHz - 59,493,244,220 instructions # 3.32 insn per cycle - 5.127126814 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.518056 sec + 19,750,480,394 cycles # 3.028 GHz + 59,611,727,500 instructions # 3.02 insn per cycle + 6.522447301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.222963e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.278776e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.278776e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.663023 sec - 9,311,853,268 cycles # 3.495 GHz - 30,712,911,054 instructions # 3.30 insn per cycle - 2.665569418 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.370584 sec + 10,396,817,898 cycles # 3.081 GHz + 30,723,473,589 instructions # 2.96 insn per cycle + 3.375008450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.294133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.316415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.316415e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.295128 sec - 4,286,401,261 cycles # 3.305 GHz - 11,055,645,863 instructions # 2.58 insn per cycle - 1.297609303 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.685691 sec + 4,902,930,827 cycles # 2.902 GHz + 11,066,989,869 instructions # 2.26 insn per cycle + 1.690115997 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.391259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.417196e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.417196e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.206857 sec - 3,995,174,773 cycles # 3.306 GHz - 10,322,661,719 instructions # 2.58 insn per cycle - 1.209297943 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.513774 sec + 4,402,683,305 cycles # 2.901 GHz + 10,346,890,880 instructions # 2.35 insn per cycle + 1.518250177 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.955766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.955766e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.888605 sec - 2,942,234,200 cycles # 3.305 GHz - 5,854,009,429 instructions # 1.99 insn per cycle - 0.891034303 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.132010 sec + 4,131,468,761 cycles # 1.935 GHz + 5,881,941,509 instructions # 1.42 insn per cycle + 2.136586909 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 6001d7f706..90bf6e6455 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:11:50 +DATE: 2024-03-01_02:30:15 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.465265 sec + 2,029,896,808 cycles # 2.980 GHz + 2,854,741,238 instructions # 1.41 insn per cycle + 0.763772288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607194 sec + 2,545,937,909 cycles # 2.996 GHz + 3,826,405,631 instructions # 1.50 insn per cycle + 0.909330494 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.239919e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.255416e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.255416e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.082609 sec - 17,776,981,365 cycles # 3.497 GHz - 58,797,467,293 instructions # 3.31 insn per cycle - 5.084873713 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.317260 sec + 19,445,883,412 cycles # 3.076 GHz + 58,795,735,881 instructions # 3.02 insn per cycle + 6.323702590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.233270e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.289870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.289870e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.653124 sec - 9,279,704,528 cycles # 3.496 GHz - 30,333,640,947 instructions # 3.27 insn per cycle - 2.655395297 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.363533 sec + 10,256,448,579 cycles # 3.046 GHz + 30,347,165,405 instructions # 2.96 insn per cycle + 3.377280590 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.249037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.269785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269785e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.336298 sec - 4,419,712,116 cycles # 3.304 GHz - 11,468,590,102 instructions # 2.59 insn per cycle - 1.338622545 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728674 sec + 5,043,692,461 cycles # 2.911 GHz + 11,484,727,811 instructions # 2.28 insn per cycle + 1.738921569 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.321449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.345024e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.345024e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.264700 sec - 4,183,889,364 cycles # 3.304 GHz - 10,818,738,539 instructions # 2.59 insn per cycle - 1.266987637 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.607009 sec + 4,642,681,786 cycles # 2.882 GHz + 10,842,961,046 instructions # 2.34 insn per cycle + 1.618440779 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.893042e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.941104e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.941104e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.890295 sec - 2,945,907,544 cycles # 3.303 GHz - 6,079,729,396 instructions # 2.06 insn per cycle - 0.892645568 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.134046 sec + 4,109,311,958 cycles # 1.922 GHz + 6,106,472,133 instructions # 1.49 insn per cycle + 2.145705149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b334186e41..af4f474b65 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:06 +DATE: 2024-03-01_02:30:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.445727 sec + 2,001,558,197 cycles # 3.000 GHz + 2,820,746,449 instructions # 1.41 insn per cycle + 0.736568143 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.500107 sec + 2,158,124,631 cycles # 2.977 GHz + 3,092,829,809 instructions # 1.43 insn per cycle + 0.784432881 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.311464e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328435e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328435e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.972112 sec - 17,386,645,192 cycles # 3.496 GHz - 58,906,569,186 instructions # 3.39 insn per cycle - 4.974344840 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.146873 sec + 19,061,096,774 cycles # 3.099 GHz + 58,958,014,215 instructions # 3.09 insn per cycle + 6.153306662 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.100537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.119548e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.119548e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.511800 sec - 5,288,183,416 cycles # 3.495 GHz - 16,687,010,204 instructions # 3.16 insn per cycle - 1.514049543 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.886682 sec + 5,850,782,122 cycles # 3.096 GHz + 16,695,269,066 instructions # 2.85 insn per cycle + 1.898716135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.462224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557798e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557798e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.688318 sec - 2,281,074,327 cycles # 3.307 GHz - 5,967,081,727 instructions # 2.62 insn per cycle - 0.690552036 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.886334 sec + 2,581,461,055 cycles # 2.900 GHz + 5,980,838,355 instructions # 2.32 insn per cycle + 0.901108038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.669958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.775682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.775682e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.637008 sec - 2,111,600,396 cycles # 3.307 GHz - 5,581,688,810 instructions # 2.64 insn per cycle - 0.639312316 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.825324 sec + 2,349,134,788 cycles # 2.832 GHz + 5,603,128,082 instructions # 2.39 insn per cycle + 0.837493797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.932137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.167092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.167092e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.440183 sec - 1,458,007,499 cycles # 3.301 GHz - 3,311,390,347 instructions # 2.27 insn per cycle - 0.442462707 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.138775 sec + 2,054,810,359 cycles # 1.798 GHz + 3,334,038,485 instructions # 1.62 insn per cycle + 1.149410848 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index dc7aef3b25..f62f4c8cdf 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:30:24 +DATE: 2024-03-01_03:15:29 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.451281 sec + 1,977,131,537 cycles # 2.986 GHz + 2,910,150,577 instructions # 1.47 insn per cycle + 0.718929629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.637857 sec + 2,608,085,808 cycles # 2.999 GHz + 3,961,129,191 instructions # 1.52 insn per cycle + 0.928114705 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.313699e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330593e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330593e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.970690 sec - 17,398,259,542 cycles # 3.499 GHz - 58,910,916,539 instructions # 3.39 insn per cycle - 4.972903941 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.166590 sec + 19,068,958,964 cycles # 3.091 GHz + 58,962,429,433 instructions # 3.09 insn per cycle + 6.170849448 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.100372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.119384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.119384e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.515113 sec - 5,305,885,790 cycles # 3.498 GHz - 16,735,297,320 instructions # 3.15 insn per cycle - 1.517362548 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.898339 sec + 5,876,062,473 cycles # 3.090 GHz + 16,741,995,731 instructions # 2.85 insn per cycle + 1.902713080 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.343773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.429942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.429942e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.724732 sec - 2,402,094,625 cycles # 3.307 GHz - 6,003,946,640 instructions # 2.50 insn per cycle - 0.727046765 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.895765 sec + 2,600,620,319 cycles # 2.891 GHz + 6,016,590,564 instructions # 2.31 insn per cycle + 0.900189489 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.660234e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.764784e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.764784e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.642041 sec - 2,128,904,938 cycles # 3.308 GHz - 5,618,859,132 instructions # 2.64 insn per cycle - 0.644359266 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.810420 sec + 2,363,958,510 cycles # 2.904 GHz + 5,639,045,986 instructions # 2.39 insn per cycle + 0.814799834 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.928773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164302e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.443516 sec - 1,473,489,767 cycles # 3.311 GHz - 3,352,410,215 instructions # 2.28 insn per cycle - 0.445839527 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.048212 sec + 2,071,251,869 cycles # 1.970 GHz + 3,374,799,702 instructions # 1.63 insn per cycle + 1.052574627 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 15e9b2f227..b43a9401e8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:19 +DATE: 2024-03-01_02:31:09 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.446885 sec + 1,972,174,797 cycles # 2.962 GHz + 2,746,314,290 instructions # 1.39 insn per cycle + 0.738224654 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.497273 sec + 2,176,246,033 cycles # 3.004 GHz + 3,133,180,341 instructions # 1.44 insn per cycle + 0.782102946 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.323721e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.340795e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.340795e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.954042 sec - 17,317,128,762 cycles # 3.495 GHz - 58,675,021,220 instructions # 3.39 insn per cycle - 4.956261990 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1024) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.143350 sec + 18,995,848,931 cycles # 3.090 GHz + 58,700,265,502 instructions # 3.09 insn per cycle + 6.150073952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.165888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187242e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.428429 sec - 4,993,014,379 cycles # 3.492 GHz - 16,503,883,029 instructions # 3.31 insn per cycle - 1.430662597 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.804269 sec + 5,584,642,506 cycles # 3.088 GHz + 16,510,962,038 instructions # 2.96 insn per cycle + 1.819572816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.140223e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.212640e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.212640e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.788410 sec - 2,610,891,292 cycles # 3.306 GHz - 6,621,195,509 instructions # 2.54 insn per cycle - 0.790692475 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.022630 sec + 2,975,513,176 cycles # 2.898 GHz + 6,634,498,276 instructions # 2.23 insn per cycle + 1.034400565 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.287783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364522e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.739428 sec - 2,449,362,633 cycles # 3.306 GHz - 6,234,567,338 instructions # 2.55 insn per cycle - 0.741705444 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.945795 sec + 2,752,522,160 cycles # 2.898 GHz + 6,256,039,450 instructions # 2.27 insn per cycle + 0.961442115 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.476688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.662734e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.662734e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.494659 sec - 1,638,048,070 cycles # 3.302 GHz - 3,675,411,384 instructions # 2.24 insn per cycle - 0.496969912 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.200320 sec + 2,230,572,619 cycles # 1.852 GHz + 3,698,329,997 instructions # 1.66 insn per cycle + 1.213663484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index d6556bb1be..568d6c4513 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:32 +DATE: 2024-03-01_02:31:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463709 sec + 2,071,639,040 cycles # 3.004 GHz + 2,941,031,538 instructions # 1.42 insn per cycle + 0.764842159 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608855 sec + 2,552,084,280 cycles # 3.004 GHz + 3,794,047,088 instructions # 1.49 insn per cycle + 0.909216297 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.149728e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.164397e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.164397e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.227650 sec - 18,278,883,667 cycles # 3.496 GHz - 60,529,183,661 instructions # 3.31 insn per cycle - 5.230024763 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1404) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.456566 sec + 20,000,355,725 cycles # 3.096 GHz + 60,532,425,335 instructions # 3.03 insn per cycle + 6.462989015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.312678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.370148e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.620058 sec - 9,163,068,711 cycles # 3.495 GHz - 30,372,553,579 instructions # 3.31 insn per cycle - 2.622366147 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.288178 sec + 10,191,043,016 cycles # 3.096 GHz + 30,384,591,666 instructions # 2.98 insn per cycle + 3.302408299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.306653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329616e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.278379 sec - 4,229,813,296 cycles # 3.305 GHz - 10,963,305,485 instructions # 2.59 insn per cycle - 1.280804887 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.686926 sec + 4,874,678,301 cycles # 2.883 GHz + 10,979,160,826 instructions # 2.25 insn per cycle + 1.698730583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.415509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.442464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.442464e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.182291 sec - 3,912,618,342 cycles # 3.305 GHz - 10,225,163,393 instructions # 2.61 insn per cycle - 1.184557374 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4279) (512y: 82) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.469271 sec + 4,278,421,569 cycles # 2.904 GHz + 10,248,685,624 instructions # 2.40 insn per cycle + 1.480280367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.830365e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.874980e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.920395 sec - 3,040,685,755 cycles # 3.298 GHz - 6,018,479,747 instructions # 1.98 insn per cycle - 0.922695458 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.183850 sec + 4,204,822,902 cycles # 1.923 GHz + 6,044,506,630 instructions # 1.44 insn per cycle + 2.192719745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 29df44b487..2001d2a062 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:48 +DATE: 2024-03-01_02:32:03 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.461655 sec + 2,079,301,655 cycles # 3.013 GHz + 2,945,288,445 instructions # 1.42 insn per cycle + 0.761228896 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.603998 sec + 2,550,056,991 cycles # 3.016 GHz + 3,770,712,997 instructions # 1.48 insn per cycle + 0.905342631 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.185651e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.200819e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.200819e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.168749 sec - 18,081,178,947 cycles # 3.497 GHz - 59,877,727,481 instructions # 3.31 insn per cycle - 5.171016307 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1262) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.482109 sec + 19,897,203,281 cycles # 3.068 GHz + 59,934,079,759 instructions # 3.01 insn per cycle + 6.488470935 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.319458e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.377020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377020e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.617395 sec - 9,156,723,314 cycles # 3.496 GHz - 30,085,782,756 instructions # 3.29 insn per cycle - 2.619742420 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.246582 sec + 10,068,513,741 cycles # 3.097 GHz + 30,097,905,174 instructions # 2.99 insn per cycle + 3.264343936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.247849e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.268698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268698e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.337352 sec - 4,424,282,737 cycles # 3.304 GHz - 11,463,474,174 instructions # 2.59 insn per cycle - 1.339740203 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4717) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728964 sec + 5,016,079,762 cycles # 2.895 GHz + 11,483,054,886 instructions # 2.29 insn per cycle + 1.742427809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.341509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.365682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.365682e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.246200 sec - 4,123,785,215 cycles # 3.305 GHz - 10,787,122,633 instructions # 2.62 insn per cycle - 1.248520950 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.580395 sec + 4,590,869,899 cycles # 2.898 GHz + 10,811,034,467 instructions # 2.35 insn per cycle + 1.596114627 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.838145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883823e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.916566 sec - 3,029,597,137 cycles # 3.300 GHz - 6,246,692,057 instructions # 2.06 insn per cycle - 0.918942029 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.184061 sec + 4,216,157,602 cycles # 1.927 GHz + 6,273,944,868 instructions # 1.49 insn per cycle + 2.195028764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a3ce2c07d0..c4f627d4b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:03 +DATE: 2024-03-01_02:32:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.491439e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.526891 sec + 2,312,216,646 cycles # 3.007 GHz + 3,538,385,257 instructions # 1.53 insn per cycle + 0.841955777 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.037875 sec + 10,086,152,870 cycles # 3.059 GHz + 22,511,661,776 instructions # 2.23 insn per cycle + 3.352868148 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481783e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483017e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483017e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.616014 sec - 23,134,926,646 cycles # 3.496 GHz - 78,769,826,090 instructions # 3.40 insn per cycle - 6.618301143 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.365178 sec + 25,629,682,297 cycles # 3.063 GHz + 78,935,463,104 instructions # 3.08 insn per cycle + 8.371779038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.886296e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.890965e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.890965e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.363857 sec - 11,665,465,951 cycles # 3.466 GHz - 39,273,332,129 instructions # 3.37 insn per cycle - 3.366166984 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352554 sec + 12,920,825,541 cycles # 2.966 GHz + 39,280,019,197 instructions # 3.04 insn per cycle + 4.370436126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.126568e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.128873e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.128873e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.462899 sec - 4,833,071,970 cycles # 3.300 GHz - 13,680,344,924 instructions # 2.83 insn per cycle - 1.465169066 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.920439 sec + 5,577,220,412 cycles # 2.899 GHz + 13,686,699,383 instructions # 2.45 insn per cycle + 1.933532640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.280240e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283273e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283273e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.288270 sec - 4,256,880,431 cycles # 3.300 GHz - 12,334,305,617 instructions # 2.90 insn per cycle - 1.290601255 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.708010 sec + 4,898,677,790 cycles # 2.863 GHz + 12,341,670,637 instructions # 2.52 insn per cycle + 1.722166284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.023870e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031547e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031547e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817740 sec - 2,700,603,798 cycles # 3.296 GHz - 6,324,447,395 instructions # 2.34 insn per cycle - 0.820100253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.187284 sec + 4,109,191,778 cycles # 1.875 GHz + 6,335,550,253 instructions # 1.54 insn per cycle + 2.200752564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 1d56a4470b..8d1778e673 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:30:51 +DATE: 2024-03-01_03:16:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.511155 sec + 2,228,194,908 cycles # 3.016 GHz + 3,541,287,827 instructions # 1.59 insn per cycle + 0.799045956 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.305480 sec + 10,998,775,521 cycles # 3.077 GHz + 24,493,841,360 instructions # 2.23 insn per cycle + 3.633710964 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481958e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483200e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483200e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.617892 sec - 23,143,239,754 cycles # 3.496 GHz - 78,775,014,566 instructions # 3.40 insn per cycle - 6.620381527 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.395628 sec + 25,661,453,890 cycles # 3.059 GHz + 78,946,626,848 instructions # 3.08 insn per cycle + 8.400144517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.816064e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.820578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.820578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.415562 sec - 11,674,359,157 cycles # 3.416 GHz - 39,286,928,254 instructions # 3.37 insn per cycle - 3.418065939 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352704 sec + 12,939,532,043 cycles # 2.970 GHz + 39,292,271,047 instructions # 3.04 insn per cycle + 4.357352756 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.129360e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.131693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.131693e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.461769 sec - 4,829,397,024 cycles # 3.300 GHz - 13,689,355,371 instructions # 2.83 insn per cycle - 1.464289743 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.929060 sec + 5,589,750,479 cycles # 2.892 GHz + 13,696,577,373 instructions # 2.45 insn per cycle + 1.933630865 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278771e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281807e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281807e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.292785 sec - 4,271,370,436 cycles # 3.300 GHz - 12,345,376,003 instructions # 2.89 insn per cycle - 1.295310523 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.695619 sec + 4,910,055,408 cycles # 2.889 GHz + 12,351,492,799 instructions # 2.52 insn per cycle + 1.700097015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.021965e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.029524e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.029524e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.821349 sec - 2,711,016,004 cycles # 3.294 GHz - 6,335,916,291 instructions # 2.34 insn per cycle - 0.823888580 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.165843 sec + 4,123,850,554 cycles # 1.901 GHz + 6,345,407,560 instructions # 1.54 insn per cycle + 2.170297070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index dc8d1365ef..597fd5665a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:35:49 +DATE: 2024-03-01_03:28:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.505991 sec + 2,242,092,583 cycles # 3.014 GHz + 3,466,791,908 instructions # 1.55 insn per cycle + 0.811853126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 3.124130 sec + 10,356,034,147 cycles # 3.069 GHz + 23,417,816,833 instructions # 2.26 insn per cycle + 3.433693053 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481844e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483078e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483078e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.615618 sec - 23,132,699,978 cycles # 3.496 GHz - 78,769,037,054 instructions # 3.41 insn per cycle - 6.617829059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.389537 sec + 25,646,805,438 cycles # 3.056 GHz + 78,935,262,340 instructions # 3.08 insn per cycle + 8.393631651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.815456e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.819978e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.819978e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.413181 sec - 11,662,452,031 cycles # 3.415 GHz - 39,273,293,393 instructions # 3.37 insn per cycle - 3.415398215 seconds time elapsed +TOTAL : 4.369422 sec + 12,916,153,129 cycles # 2.954 GHz + 39,278,867,860 instructions # 3.04 insn per cycle + 4.373667823 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.130382e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132719e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.132719e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.458091 sec - 4,816,775,693 cycles # 3.300 GHz - 13,680,360,383 instructions # 2.84 insn per cycle - 1.460366647 seconds time elapsed +TOTAL : 1.933878 sec + 5,580,678,683 cycles # 2.881 GHz + 13,684,529,284 instructions # 2.45 insn per cycle + 1.937965494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278483e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281512e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281512e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.290121 sec - 4,262,527,735 cycles # 3.300 GHz - 12,334,714,107 instructions # 2.89 insn per cycle - 1.292329143 seconds time elapsed +TOTAL : 1.697628 sec + 4,903,453,092 cycles # 2.882 GHz + 12,338,806,795 instructions # 2.52 insn per cycle + 1.701856837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.025121e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.032750e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.032750e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817158 sec - 2,696,851,480 cycles # 3.294 GHz - 6,324,641,682 instructions # 2.35 insn per cycle - 0.819432272 seconds time elapsed +TOTAL : 2.253262 sec + 4,111,107,725 cycles # 1.822 GHz + 6,332,329,650 instructions # 1.54 insn per cycle + 2.257544828 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 93293d216b..60e01cd2dd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:34:23 +DATE: 2024-03-01_03:24:50 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.510827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541615e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.503655 sec + 2,239,000,994 cycles # 3.024 GHz + 3,553,306,239 instructions # 1.59 insn per cycle + 0.813367897 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.145153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180837e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.061657 sec + 10,188,245,124 cycles # 3.074 GHz + 23,248,414,020 instructions # 2.28 insn per cycle + 3.370951944 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted - 4,606,961 cycles # 3.257 GHz - 6,302,943 instructions # 1.37 insn per cycle - 0.037961845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.938252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.939166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.939166e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.471636 sec + 25,650,928,170 cycles # 3.027 GHz + 78,935,761,644 instructions # 3.08 insn per cycle + 8.475777896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,632,503 cycles # 2.696 GHz - 6,332,916 instructions # 1.37 insn per cycle - 0.038722425 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.732481e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.735838e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.735838e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.403548 sec + 12,924,361,173 cycles # 2.933 GHz + 39,279,334,894 instructions # 3.04 insn per cycle + 4.407811208 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,502,042 cycles # 2.684 GHz - 6,323,077 instructions # 1.40 insn per cycle - 0.038401230 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.485088e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.502714e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.502714e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.942219 sec + 5,571,920,631 cycles # 2.864 GHz + 13,685,480,241 instructions # 2.46 insn per cycle + 1.946449782 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,898,492 cycles # 3.254 GHz - 6,342,668 instructions # 1.29 insn per cycle - 0.037656802 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.737761e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.761950e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.761950e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.693500 sec + 4,894,918,115 cycles # 2.884 GHz + 12,340,665,409 instructions # 2.52 insn per cycle + 1.697702233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,865,885 cycles # 3.245 GHz - 6,331,116 instructions # 1.30 insn per cycle - 0.038706032 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.532631e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.547658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.547658e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.186925 sec + 4,105,530,431 cycles # 1.874 GHz + 6,333,977,995 instructions # 1.54 insn per cycle + 2.191453097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index f7105fde21..de32359ede 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:33:23 +DATE: 2024-03-01_03:21:36 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.508517 sec + 2,246,531,629 cycles # 3.011 GHz + 3,559,465,442 instructions # 1.58 insn per cycle + 0.806328345 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.195111 sec + 10,565,694,760 cycles # 3.061 GHz + 24,272,327,456 instructions # 2.30 insn per cycle + 3.508790742 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483072e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483072e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.615817 sec - 23,135,149,155 cycles # 3.496 GHz - 78,769,523,152 instructions # 3.40 insn per cycle - 6.618024105 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.415718 sec + 25,630,796,247 cycles # 3.044 GHz + 78,935,144,677 instructions # 3.08 insn per cycle + 8.419920398 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.822215e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.826748e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.826748e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.408335 sec - 11,661,748,107 cycles # 3.420 GHz - 39,273,222,931 instructions # 3.37 insn per cycle - 3.410571683 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.383944 sec + 12,941,364,841 cycles # 2.950 GHz + 39,279,009,350 instructions # 3.04 insn per cycle + 4.388336169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.127496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.129817e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.129817e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.461766 sec - 4,828,963,686 cycles # 3.300 GHz - 13,679,994,736 instructions # 2.83 insn per cycle - 1.464026212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.951803 sec + 5,576,482,664 cycles # 2.852 GHz + 13,685,505,947 instructions # 2.45 insn per cycle + 1.956019187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278160e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281173e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281173e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.290276 sec - 4,260,419,952 cycles # 3.298 GHz - 12,334,650,697 instructions # 2.90 insn per cycle - 1.292701283 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.690955 sec + 4,892,330,509 cycles # 2.888 GHz + 12,340,572,549 instructions # 2.52 insn per cycle + 1.695111197 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.023476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031069e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031069e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817764 sec - 2,700,756,236 cycles # 3.297 GHz - 6,324,382,140 instructions # 2.34 insn per cycle - 0.819998960 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.155567 sec + 4,105,793,778 cycles # 1.902 GHz + 6,333,858,387 instructions # 1.54 insn per cycle + 2.159935327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index cad62f799d..836b2fd223 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:22 +DATE: 2024-03-01_02:33:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.492178e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.523446 sec + 2,259,779,898 cycles # 2.994 GHz + 3,514,783,609 instructions # 1.56 insn per cycle + 0.830655921 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.027147 sec + 10,102,095,677 cycles # 3.066 GHz + 22,774,733,235 instructions # 2.25 insn per cycle + 3.352533111 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.503316e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.504578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.504578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.559386 sec - 22,937,995,811 cycles # 3.496 GHz - 78,503,071,109 instructions # 3.42 insn per cycle - 6.561632048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4246) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.342362 sec + 25,562,894,530 cycles # 3.064 GHz + 78,707,498,900 instructions # 3.08 insn per cycle + 8.350709191 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.721965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.726341e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.726341e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.480511 sec - 11,746,156,846 cycles # 3.373 GHz - 39,219,964,596 instructions # 3.34 insn per cycle - 3.482689146 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.374701 sec + 12,919,245,066 cycles # 2.951 GHz + 39,226,355,054 instructions # 3.04 insn per cycle + 4.387657418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.134275e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136627e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.136627e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.452965 sec - 4,799,994,769 cycles # 3.300 GHz - 13,794,367,131 instructions # 2.87 insn per cycle - 1.455293281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.987975 sec + 5,629,143,308 cycles # 2.825 GHz + 13,800,788,871 instructions # 2.45 insn per cycle + 1.999251955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.263188e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.266183e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266183e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.305376 sec - 4,313,294,929 cycles # 3.300 GHz - 12,459,449,195 instructions # 2.89 insn per cycle - 1.307767380 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.716692 sec + 4,942,228,477 cycles # 2.873 GHz + 12,466,581,724 instructions # 2.52 insn per cycle + 1.728222884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.017201e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.024979e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.024979e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.820329 sec - 2,710,241,576 cycles # 3.297 GHz - 6,448,202,008 instructions # 2.38 insn per cycle - 0.822690390 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.159145 sec + 4,117,977,410 cycles # 1.904 GHz + 6,458,802,297 instructions # 1.57 insn per cycle + 2.172057894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 70731b668b..5cb26f1dc5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:24:37 +DATE: 2024-03-01_03:05:40 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264818e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.529504 sec + 2,311,611,520 cycles # 3.006 GHz + 3,548,053,349 instructions # 1.53 insn per cycle + 0.826491750 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.298192 sec + 10,832,117,508 cycles # 3.051 GHz + 23,123,371,744 instructions # 2.13 insn per cycle + 3.609870208 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.040051e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.040564e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.040564e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 32.548023 sec - 113,830,166,208 cycles # 3.497 GHz - 143,883,071,750 instructions # 1.26 insn per cycle - 32.550633058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21053) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 36.966049 sec + 113,615,073,618 cycles # 3.074 GHz + 144,968,095,911 instructions # 1.28 insn per cycle + 36.970400514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.954257e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.957301e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.957301e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.155080 sec - 14,533,662,034 cycles # 3.496 GHz - 37,569,403,247 instructions # 2.58 insn per cycle - 4.157544496 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.007790 sec + 14,730,075,423 cycles # 2.939 GHz + 37,574,123,368 instructions # 2.55 insn per cycle + 5.012256986 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.045261e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.060223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.060223e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.820401 sec - 6,012,541,840 cycles # 3.300 GHz - 13,057,135,841 instructions # 2.17 insn per cycle - 1.822679574 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.127650 sec + 6,163,100,705 cycles # 2.892 GHz + 13,061,449,928 instructions # 2.12 insn per cycle + 2.132187716 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.099476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.101705e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.101705e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.499045 sec - 4,952,169,341 cycles # 3.300 GHz - 11,434,777,878 instructions # 2.31 insn per cycle - 1.501339023 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.743142 sec + 5,059,957,423 cycles # 2.897 GHz + 11,440,000,239 instructions # 2.26 insn per cycle + 1.747501406 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.995108e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.002501e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.002501e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.829410 sec - 2,740,060,865 cycles # 3.297 GHz - 5,933,015,109 instructions # 2.17 insn per cycle - 0.831788683 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.075865 sec + 3,979,244,183 cycles # 1.914 GHz + 5,942,139,795 instructions # 1.49 insn per cycle + 2.080305520 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 37ef5ca493..afca4b7953 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:25:24 +DATE: 2024-03-01_03:06:48 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.275983e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.531287 sec + 2,311,991,159 cycles # 3.015 GHz + 3,584,221,599 instructions # 1.55 insn per cycle + 0.825938734 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.269849 sec + 10,805,743,512 cycles # 3.068 GHz + 25,084,175,459 instructions # 2.32 insn per cycle + 3.579404730 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.995413e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.995921e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.995921e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 32.838703 sec - 114,844,431,122 cycles # 3.497 GHz - 144,446,965,492 instructions # 1.26 insn per cycle - 32.841012912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22369) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 37.253529 sec + 114,121,742,420 cycles # 3.069 GHz + 145,689,073,244 instructions # 1.28 insn per cycle + 37.257693750 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.843711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.846578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.846578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.274359 sec - 14,949,342,714 cycles # 3.496 GHz - 37,756,081,643 instructions # 2.53 insn per cycle - 4.276658422 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.136766 sec + 15,152,451,249 cycles # 2.948 GHz + 37,761,291,325 instructions # 2.49 insn per cycle + 5.141156615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.243220e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.258795e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.258795e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.781599 sec - 5,884,681,951 cycles # 3.300 GHz - 12,892,638,092 instructions # 2.19 insn per cycle - 1.783976623 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072422 sec + 6,013,210,013 cycles # 2.896 GHz + 12,895,807,400 instructions # 2.14 insn per cycle + 2.076740513 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.089187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091372e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091372e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.513045 sec - 4,998,315,786 cycles # 3.300 GHz - 11,441,056,031 instructions # 2.29 insn per cycle - 1.515430361 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.755119 sec + 5,091,337,522 cycles # 2.895 GHz + 11,446,622,503 instructions # 2.25 insn per cycle + 1.759562583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.009625e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017155e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.823716 sec - 2,720,763,680 cycles # 3.298 GHz - 5,887,165,290 instructions # 2.16 insn per cycle - 0.826091686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.059473 sec + 3,944,538,203 cycles # 1.912 GHz + 5,896,184,476 instructions # 1.49 insn per cycle + 2.063940696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index d61f6be5ab..082176c355 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:41 +DATE: 2024-03-01_02:33:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401451e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.481440 sec + 2,077,514,231 cycles # 2.979 GHz + 3,093,505,744 instructions # 1.49 insn per cycle + 0.777796663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.622317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.713365 sec + 5,944,272,538 cycles # 3.053 GHz + 12,632,277,461 instructions # 2.13 insn per cycle + 2.004079656 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.541023e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.542272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.542272e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.472852 sec - 22,594,942,433 cycles # 3.496 GHz - 78,109,438,979 instructions # 3.46 insn per cycle - 6.475034460 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.010109 sec + 24,614,432,061 cycles # 3.072 GHz + 78,126,558,251 instructions # 3.17 insn per cycle + 8.016891762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.868488e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.888089e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.888089e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.680412 sec - 5,721,227,554 cycles # 3.426 GHz - 20,115,812,778 instructions # 3.52 insn per cycle - 1.682695212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.228676 sec + 6,461,822,382 cycles # 2.894 GHz + 20,120,855,558 instructions # 3.11 insn per cycle + 2.241648353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.195033e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204366e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.753674 sec - 2,492,228,979 cycles # 3.300 GHz - 6,983,768,704 instructions # 2.80 insn per cycle - 0.755942549 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.990019 sec + 2,821,251,649 cycles # 2.839 GHz + 6,989,221,748 instructions # 2.48 insn per cycle + 1.002444816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.521876e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.534107e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.534107e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.656748 sec - 2,172,255,103 cycles # 3.300 GHz - 6,290,065,451 instructions # 2.90 insn per cycle - 0.658959491 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.861179 sec + 2,488,986,957 cycles # 2.876 GHz + 6,296,476,670 instructions # 2.53 insn per cycle + 0.887481911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.029368e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061175e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061175e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.413448 sec - 1,368,272,372 cycles # 3.298 GHz - 3,258,833,410 instructions # 2.38 insn per cycle - 0.415626098 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.078476 sec + 2,048,809,794 cycles # 1.894 GHz + 3,266,667,713 instructions # 1.59 insn per cycle + 1.091634951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 4bdc113ec5..6f564b583c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:31:10 +DATE: 2024-03-01_03:17:05 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468201 sec + 2,060,292,715 cycles # 2.983 GHz + 3,094,906,819 instructions # 1.50 insn per cycle + 0.750075013 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.882218 sec + 6,478,461,444 cycles # 3.059 GHz + 12,879,929,349 instructions # 1.99 insn per cycle + 2.174649918 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.540494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541741e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541741e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.463976 sec - 22,603,420,847 cycles # 3.496 GHz - 78,113,326,401 instructions # 3.46 insn per cycle - 6.466307048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.044775 sec + 24,623,818,516 cycles # 3.060 GHz + 78,132,484,739 instructions # 3.17 insn per cycle + 8.049291657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.972987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.992778e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.992778e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.652838 sec - 5,725,272,561 cycles # 3.461 GHz - 20,124,950,762 instructions # 3.52 insn per cycle - 1.655209946 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.197009 sec + 6,464,288,620 cycles # 2.938 GHz + 20,129,426,624 instructions # 3.11 insn per cycle + 2.201352169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251048e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260877e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.736738 sec - 2,436,577,597 cycles # 3.300 GHz - 6,993,293,212 instructions # 2.87 insn per cycle - 0.739071334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.973161 sec + 2,827,392,405 cycles # 2.894 GHz + 6,998,075,079 instructions # 2.48 insn per cycle + 0.977561277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.520306e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.532523e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.532523e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.658909 sec - 2,179,756,886 cycles # 3.300 GHz - 6,299,575,077 instructions # 2.89 insn per cycle - 0.661241811 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859317 sec + 2,491,742,914 cycles # 2.887 GHz + 6,305,390,293 instructions # 2.53 insn per cycle + 0.863665296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.103354e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.136311e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.136311e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.408039 sec - 1,350,732,763 cycles # 3.299 GHz - 3,269,590,389 instructions # 2.42 insn per cycle - 0.410460189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.067932 sec + 2,057,227,059 cycles # 1.920 GHz + 3,276,345,738 instructions # 1.59 insn per cycle + 1.072312021 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index d77de8d4f6..66226e8d59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:36:07 +DATE: 2024-03-01_03:28:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.363626e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.461299 sec + 2,006,885,691 cycles # 2.992 GHz + 3,022,532,155 instructions # 1.51 insn per cycle + 0.728549346 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.795584 sec + 6,148,728,410 cycles # 3.042 GHz + 12,326,233,623 instructions # 2.00 insn per cycle + 2.078967785 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.539698e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.540976e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.540976e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.464734 sec - 22,602,962,065 cycles # 3.496 GHz - 78,108,873,476 instructions # 3.46 insn per cycle - 6.466917782 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.994149 sec + 24,620,138,866 cycles # 3.079 GHz + 78,125,377,108 instructions # 3.17 insn per cycle + 7.998228624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.925627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.945236e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.945236e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.658772 sec - 5,725,250,336 cycles # 3.449 GHz - 20,115,356,658 instructions # 3.51 insn per cycle - 1.660966677 seconds time elapsed +TOTAL : 2.242069 sec + 6,461,640,731 cycles # 2.878 GHz + 20,121,052,869 instructions # 3.11 insn per cycle + 2.246196034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251477e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.261264e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.261264e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.734787 sec - 2,429,802,465 cycles # 3.300 GHz - 6,983,714,880 instructions # 2.87 insn per cycle - 0.736976159 seconds time elapsed +TOTAL : 0.982986 sec + 2,822,415,829 cycles # 2.862 GHz + 6,987,486,660 instructions # 2.48 insn per cycle + 0.987025186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.521461e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.533742e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.533742e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.656838 sec - 2,172,470,830 cycles # 3.300 GHz - 6,289,863,881 instructions # 2.90 insn per cycle - 0.658982044 seconds time elapsed +TOTAL : 0.855808 sec + 2,484,894,865 cycles # 2.892 GHz + 6,291,816,709 instructions # 2.53 insn per cycle + 0.859867773 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.110621e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.143899e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.143899e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.405468 sec - 1,341,887,518 cycles # 3.298 GHz - 3,258,430,365 instructions # 2.43 insn per cycle - 0.407620788 seconds time elapsed +TOTAL : 1.069890 sec + 2,051,026,977 cycles # 1.912 GHz + 3,263,937,559 instructions # 1.59 insn per cycle + 1.073863100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 06ac489be0..e810053300 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:34:29 +DATE: 2024-03-01_03:25:26 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.337764e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.393743e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.460965 sec + 2,014,485,763 cycles # 3.003 GHz + 3,009,625,577 instructions # 1.49 insn per cycle + 0.728425666 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.558734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.632343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635567e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.743753 sec + 6,041,672,737 cycles # 3.067 GHz + 12,221,124,809 instructions # 2.02 insn per cycle + 2.027112098 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted - 4,588,745 cycles # 3.254 GHz - 6,297,711 instructions # 1.37 insn per cycle - 0.038493069 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.040104e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041097e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041097e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.046597 sec + 24,613,022,395 cycles # 3.060 GHz + 78,130,326,722 instructions # 3.17 insn per cycle + 8.050808561 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,649,873 cycles # 3.255 GHz - 6,330,030 instructions # 1.36 insn per cycle - 0.038267941 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.468090e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.482424e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.482424e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.203809 sec + 6,456,229,713 cycles # 2.925 GHz + 20,119,923,968 instructions # 3.12 insn per cycle + 2.207913022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,641,444 cycles # 3.257 GHz - 6,326,220 instructions # 1.36 insn per cycle - 0.037581414 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.712278e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.719631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.719631e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.965705 sec + 2,817,996,939 cycles # 2.908 GHz + 6,988,025,639 instructions # 2.48 insn per cycle + 0.969794950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,553,782 cycles # 3.259 GHz - 6,327,532 instructions # 1.39 insn per cycle - 0.038327048 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.924856e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934354e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934354e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859873 sec + 2,483,822,785 cycles # 2.877 GHz + 6,295,526,273 instructions # 2.53 insn per cycle + 0.863979329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,589,693 cycles # 3.232 GHz - 6,342,822 instructions # 1.38 insn per cycle - 0.039622291 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558368e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.064425 sec + 2,047,040,960 cycles # 1.917 GHz + 3,265,583,381 instructions # 1.60 insn per cycle + 1.068371519 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 597c41c4c4..29def3747b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:33:42 +DATE: 2024-03-01_03:22:13 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.463988 sec + 2,009,660,419 cycles # 2.987 GHz + 3,043,780,102 instructions # 1.51 insn per cycle + 0.732052318 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.829361 sec + 6,179,090,687 cycles # 3.005 GHz + 13,497,023,724 instructions # 2.18 insn per cycle + 2.119489112 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe 
/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.541255e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.542497e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.542497e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.460363 sec - 22,594,860,702 cycles # 3.497 GHz - 78,108,705,130 instructions # 3.46 insn per cycle - 6.462497444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.072340 sec + 24,646,233,583 cycles # 3.055 GHz + 78,130,465,005 instructions # 3.17 insn per cycle + 8.076398723 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.980894e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000089e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000089e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.649650 sec - 5,715,455,769 cycles # 3.462 GHz - 20,116,970,711 instructions # 3.52 insn per cycle - 1.651831111 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.213064 sec + 6,463,144,308 cycles # 2.916 GHz + 20,121,040,605 instructions # 3.11 insn per cycle + 2.217197026 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.224266e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233876e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233876e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.743696 sec - 2,459,110,588 cycles # 3.300 GHz - 6,984,047,266 instructions # 2.84 insn per cycle - 0.745892046 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.977816 sec + 2,816,932,981 cycles # 2.871 GHz + 6,987,870,279 instructions # 2.48 insn per cycle + 0.981891147 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.517139e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.529391e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.529391e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.657921 sec - 2,175,611,006 cycles # 3.300 GHz - 6,290,403,371 instructions # 2.89 insn per cycle - 0.660110597 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859893 sec + 2,483,713,955 cycles # 2.877 GHz + 6,295,351,555 instructions # 2.53 insn per cycle + 0.863911879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.107456e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.140872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.140872e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.405676 sec - 1,342,942,249 cycles # 3.298 GHz - 3,258,636,901 instructions # 2.43 insn per cycle - 0.407877226 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.064299 sec + 2,046,605,748 cycles # 1.917 GHz + 3,265,707,472 instructions # 1.60 insn per cycle + 1.068273671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 2ad237dd1a..50b444080d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:56 +DATE: 2024-03-01_02:34:14 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.380502e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.482038 sec + 2,083,496,491 cycles # 2.987 GHz + 3,090,021,729 instructions # 1.48 insn per cycle + 0.780369869 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.505248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.719742 sec + 5,952,430,615 cycles # 3.047 GHz + 11,750,571,480 instructions # 1.97 insn per cycle + 2.009992190 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.559405e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.560665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.560665e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.414841 sec - 22,433,654,429 cycles # 3.497 GHz - 77,753,050,419 instructions # 3.47 insn per cycle - 6.417041697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3125) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.050624 sec + 24,577,706,132 cycles # 3.054 GHz + 77,857,469,800 instructions # 3.17 insn per cycle + 8.057072902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863364631370E-004 -Relative difference = 5.076783822441729e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866268634797E-004 +Relative difference = 5.630135835748959e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.823157e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.842320e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.842320e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.676003 sec - 5,656,803,299 cycles # 3.372 GHz - 20,080,771,686 instructions # 3.55 insn per cycle - 1.678252022 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.274363 sec + 6,415,212,085 cycles # 2.816 GHz + 20,086,390,532 instructions # 3.13 insn per cycle + 2.288238797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861465384638E-004 Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.186032e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197298e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197298e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.756528 sec - 2,501,574,254 cycles # 3.300 GHz - 7,125,249,610 instructions # 2.85 insn per cycle - 0.758780713 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.010969 sec + 2,918,129,602 cycles # 2.878 GHz + 7,130,827,098 instructions # 2.44 insn per cycle + 1.024648825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.413858e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425042e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.425042e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.685713 sec - 2,267,843,391 cycles # 3.300 GHz - 6,433,455,899 instructions # 2.84 insn per cycle - 0.687939686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.895519 sec + 2,583,274,132 cycles # 2.873 GHz + 6,439,451,842 instructions # 2.49 insn per cycle + 0.910176239 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.907924e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.937842e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.937842e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.426031 sec - 1,409,552,540 cycles # 3.297 GHz - 3,420,395,875 instructions # 2.43 insn per cycle - 0.428204579 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.109477 sec + 2,120,739,457 cycles # 1.905 GHz + 3,428,489,642 instructions # 1.62 insn per cycle + 1.120804955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952032322112E-004 Relative difference = 3.066639970473621e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 146b66385f..3e610d68fd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:26:12 +DATE: 2024-03-01_03:07:56 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.599390e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.487762 sec + 2,117,397,644 cycles # 2.979 GHz + 3,170,491,357 instructions # 1.50 insn per cycle + 0.771619877 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.728616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.852993 sec + 6,403,206,858 cycles # 3.066 GHz + 13,984,822,985 instructions # 2.18 insn per cycle + 2.145838793 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.142872e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.143601e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.143601e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 26.704464 sec - 93,392,949,985 cycles # 3.497 GHz - 135,135,739,845 instructions # 1.45 insn per cycle - 26.706712729 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15558) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.541681 sec + 87,683,123,741 cycles # 3.072 GHz + 135,626,627,328 instructions # 1.55 insn per cycle + 28.545959109 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340237237357E-004 -Relative difference = 3.579572077573998e-09 +Avg ME (F77/C++) = 6.6275340277317796E-004 +Relative difference = 4.184328521943034e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.661989e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.676894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.676894e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.900144 sec - 6,560,267,417 cycles # 3.450 GHz - 19,382,075,169 instructions # 2.95 insn per cycle - 1.902414328 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.302428 sec + 6,776,067,855 cycles # 2.939 GHz + 19,386,467,667 instructions # 2.86 insn per cycle + 2.306810458 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862707273868E-004 Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.751056e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.757067e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.757067e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.943073 sec - 3,117,335,618 cycles # 3.300 GHz - 6,803,703,724 instructions # 2.18 insn per cycle - 0.945302019 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.096393 sec + 3,175,310,502 cycles # 2.890 GHz + 6,807,675,147 instructions # 2.14 insn per cycle + 1.100557110 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.112187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120841e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120841e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.782995 sec - 2,589,201,691 cycles # 3.300 GHz - 5,981,113,539 instructions # 2.31 insn per cycle - 0.785295392 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.911313 sec + 2,641,911,907 cycles # 2.888 GHz + 5,985,989,672 instructions # 2.27 insn per cycle + 0.915610697 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.297217e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318515e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 -TOTAL : 0.503966 sec - 1,665,820,543 cycles # 3.296 GHz - 3,493,880,654 instructions # 2.10 insn per cycle - 0.506230823 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.084772 sec + 2,074,111,548 cycles # 1.906 GHz + 3,500,542,355 instructions # 1.69 insn per cycle + 1.089027435 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750363879224E-004 Relative difference = 5.490631193034436e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index dd21065806..f668536073 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:26:49 +DATE: 2024-03-01_03:08:48 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593399e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.485011 sec + 2,123,544,393 cycles # 3.007 GHz + 3,219,525,664 instructions # 1.52 insn per cycle + 0.766064420 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.637487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.858325 sec + 6,401,876,626 cycles # 3.056 GHz + 13,834,352,039 instructions # 2.16 insn per cycle + 2.151127842 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.387508e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388300e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388300e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 25.683491 sec - 89,826,358,356 cycles # 3.497 GHz - 135,208,960,358 instructions # 1.51 insn per cycle - 25.685677569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15804) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.469746 sec + 87,566,965,728 cycles # 3.076 GHz + 135,909,521,186 instructions # 1.55 insn per cycle + 28.473960910 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275352716470975E-004 -Relative difference = 4.098765184605283e-08 +Avg ME (F77/C++) = 6.6275352674967369E-004 +Relative difference = 4.0361421941458736e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.437796e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.452057e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.452057e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.950080 sec - 6,654,162,703 cycles # 3.410 GHz - 19,433,960,536 instructions # 2.92 insn per cycle - 1.952366253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.304055 sec + 6,854,008,563 cycles # 2.972 GHz + 19,438,508,034 instructions # 2.84 insn per cycle + 2.308246423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862764021530E-004 Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.786843e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.793023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.793023e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.924113 sec - 3,054,845,902 cycles # 3.300 GHz - 6,714,445,014 instructions # 2.20 insn per cycle - 0.926394819 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.070827 sec + 3,111,432,280 cycles # 2.896 GHz + 6,718,585,544 instructions # 2.16 insn per cycle + 1.075017514 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.132201e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140940e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.140940e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.775532 sec - 2,564,183,121 cycles # 3.300 GHz - 5,963,951,119 instructions # 2.33 insn per cycle - 0.777744409 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.900474 sec + 2,630,752,588 cycles # 2.910 GHz + 5,969,340,561 instructions # 2.27 insn per cycle + 0.904647261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.326443e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.348069e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.348069e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 -TOTAL : 0.499522 sec - 1,651,245,664 cycles # 3.296 GHz - 3,487,405,377 instructions # 2.11 insn per cycle - 0.501723648 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.083027 sec + 2,083,719,160 cycles # 1.918 GHz + 3,494,111,175 instructions # 1.68 insn per cycle + 1.087325959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750384530066E-004 Relative difference = 5.80223501432476e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index f9ab256fce..8553820a52 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:14:11 +DATE: 2024-03-01_02:34:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.504525e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522907 sec + 2,248,416,129 cycles # 2.981 GHz + 3,483,881,112 instructions # 1.55 insn per cycle + 0.829467781 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.035491 sec + 10,039,386,860 cycles # 3.052 GHz + 22,522,898,713 instructions # 2.24 insn per cycle + 3.349083086 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266732376103494E-004 +Relative difference = 2.659538381540814e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.460441e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.461666e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.461666e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.673146 sec - 23,335,676,255 cycles # 3.496 GHz - 79,277,751,932 instructions # 3.40 insn per cycle - 6.675453883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4801) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.409354 sec + 25,927,870,734 cycles # 3.082 GHz + 79,436,480,305 instructions # 3.06 insn per cycle + 8.416137774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.686446e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690711e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690711e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.506769 sec - 11,577,360,778 cycles # 3.300 GHz - 38,543,055,165 instructions # 3.33 insn per cycle - 3.509025941 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.395641 sec + 12,641,926,900 cycles # 2.873 GHz + 38,549,360,435 instructions # 3.05 insn per cycle + 4.411574958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.172980e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175482e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.175482e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.405389 sec - 4,642,988,163 cycles # 3.300 GHz - 13,474,033,843 instructions # 2.90 insn per cycle - 1.407688497 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.889905 sec + 5,503,418,397 cycles # 2.905 GHz + 13,481,227,468 instructions # 2.45 insn per cycle + 1.901949052 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.284064e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.287100e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.287100e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.284493 sec - 4,244,001,962 cycles # 3.300 GHz - 12,130,560,495 instructions # 2.86 insn per cycle - 1.286759541 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.679659 sec + 4,858,057,374 cycles # 2.885 GHz + 12,135,455,571 instructions # 2.50 insn per cycle + 1.694768152 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.007453e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.014936e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.014936e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.824364 sec - 2,723,570,767 cycles # 3.298 GHz - 6,306,786,489 instructions # 2.32 insn per cycle - 0.826694116 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.297248 sec + 4,143,595,621 cycles # 1.801 GHz + 6,336,694,490 instructions # 1.53 insn per cycle + 2.312628428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 0e2a0168d0..44d560fb63 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:14:29 +DATE: 2024-03-01_02:35:21 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505143e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522485 sec + 2,266,664,443 cycles # 3.011 GHz + 3,552,942,464 instructions # 1.57 insn per cycle + 0.824080628 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.023944 sec + 10,029,910,184 cycles # 3.059 GHz + 21,497,951,661 instructions # 2.14 insn per cycle + 3.338904131 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266732376103494E-004 +Relative difference = 2.659538381540814e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.465973e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.467206e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.467206e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.658248 sec - 23,291,569,184 cycles # 3.498 GHz - 79,237,899,390 instructions # 3.40 insn per cycle - 6.660539391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4465) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.531114 sec + 25,939,606,781 cycles # 3.040 GHz + 79,447,311,630 instructions # 3.06 insn per cycle + 8.537643841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.697632e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.701963e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.701963e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.498479 sec - 11,550,565,484 cycles # 3.300 GHz - 38,513,812,102 instructions # 3.33 insn per cycle - 3.500917728 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.372440 sec + 12,693,692,693 cycles # 2.901 GHz + 38,521,475,204 instructions # 3.03 insn per cycle + 4.385193423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.152534e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154954e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154954e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.430198 sec - 4,725,017,333 cycles # 3.300 GHz - 13,599,869,443 instructions # 2.88 insn per cycle - 1.432524334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.908191 sec + 5,531,901,200 cycles # 2.893 GHz + 13,605,961,475 instructions # 2.46 insn per cycle + 1.920337987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.271343e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.274342e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274342e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.297199 sec - 4,286,083,032 cycles # 3.300 GHz - 12,263,686,047 instructions # 2.86 insn per cycle - 1.299546744 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.699452 sec + 4,910,284,170 cycles # 2.883 GHz + 12,271,024,564 instructions # 2.50 insn per cycle + 1.712563313 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.010301e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017802e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017802e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.823170 sec - 2,718,854,468 cycles # 3.297 GHz - 6,413,166,633 instructions # 2.36 insn per cycle - 0.825525653 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.177959 sec + 4,164,411,217 cycles # 1.910 GHz + 6,442,301,345 instructions # 1.55 insn per cycle + 2.190574077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 3e8c727bbe..93119c7539 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:15:32 +DATE: 2024-03-01_02:37:42 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065940e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.448496 sec + 8,082,390,398 cycles # 2.946 GHz + 16,852,562,382 instructions # 2.09 insn per cycle + 2.848455369 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.002127 sec + 13,348,526,839 cycles # 3.088 GHz + 31,140,905,358 instructions # 2.33 insn per cycle + 4.382097820 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.007917e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.007943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.007943e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.242575 sec - 18,324,214,896 cycles # 3.494 GHz - 53,656,205,350 instructions # 2.93 insn per cycle - 5.244757907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.566168 sec + 18,831,689,747 cycles # 2.868 GHz + 53,916,332,004 instructions # 2.86 insn per cycle + 6.572689464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907110e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907199e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907199e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.771308 sec - 9,690,965,867 cycles # 3.495 GHz - 27,085,067,925 instructions # 2.79 insn per cycle - 2.773516868 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.182674 sec + 9,806,871,766 cycles # 3.081 GHz + 27,093,022,297 instructions # 2.76 insn per cycle + 3.192772007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.253444e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.253870e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.253870e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.243280 sec - 4,107,456,413 cycles # 3.300 GHz - 9,554,437,400 instructions # 2.33 insn per cycle - 1.245464164 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.462430 sec + 4,231,767,010 cycles # 2.892 GHz + 9,562,001,834 instructions # 2.26 insn per cycle + 1.472832936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901436e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.902024e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.902024e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079126 sec - 3,565,937,704 cycles # 3.300 GHz - 8,478,433,637 instructions # 2.38 insn per cycle - 1.081341215 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.282131 sec + 3,734,243,960 cycles # 2.905 GHz + 8,486,594,514 instructions # 2.27 insn per cycle + 1.294140643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.570541e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.572308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.572308e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.618837 sec - 2,038,410,234 cycles # 3.287 GHz - 4,264,215,045 instructions # 2.09 insn per cycle - 0.620988332 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.432645 sec + 2,701,519,987 cycles # 1.882 GHz + 4,274,080,381 instructions # 1.58 insn per cycle + 1.444722496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 898ee4858d..7163808f45 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:31:25 +DATE: 2024-03-01_03:17:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.373786 sec + 8,212,794,649 cycles # 3.050 GHz + 17,373,508,782 instructions # 2.12 insn per cycle + 2.749788140 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.992060 sec + 13,207,906,873 cycles # 3.062 GHz + 30,525,969,027 instructions # 2.31 insn per cycle + 4.371813741 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.008044e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008070e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.008070e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.240399 sec - 18,320,301,067 cycles # 3.495 GHz - 53,657,376,785 instructions # 2.93 insn per cycle - 5.242561648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.484661 sec + 18,737,465,302 cycles # 2.888 GHz + 53,915,906,594 instructions # 2.88 insn per cycle + 6.488680620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907377e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907467e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907467e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.770516 sec - 9,697,495,185 cycles # 3.498 GHz - 27,086,113,073 instructions # 2.79 insn per cycle - 2.772701601 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.177972 sec + 9,794,551,146 cycles # 3.079 GHz + 27,093,049,280 instructions # 2.77 insn per cycle + 3.182112356 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.249825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.250273e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.250273e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.244792 sec - 4,112,966,387 cycles # 3.300 GHz - 9,555,785,054 instructions # 2.32 insn per cycle - 1.246985323 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.495047 sec + 4,300,282,840 cycles # 2.870 GHz + 9,561,701,370 instructions # 2.22 insn per cycle + 1.499121189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901276e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.901882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.901882e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079264 sec - 3,566,506,063 cycles # 3.300 GHz - 8,479,328,463 instructions # 2.38 insn per cycle - 1.081393203 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.287264 sec + 3,730,461,014 cycles # 2.891 GHz + 8,485,603,542 instructions # 2.27 insn per cycle + 1.291227222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.552403e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.554214e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.554214e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.620126 sec - 2,047,589,081 cycles # 3.294 GHz - 4,264,912,626 instructions # 2.08 insn per cycle - 0.622279806 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.415968 sec + 2,690,639,160 cycles # 1.896 GHz + 4,273,336,878 instructions # 1.59 insn per cycle + 1.420067464 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 6a223c1182..fcaae9673e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:15:57 +DATE: 2024-03-01_02:38:46 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067339e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.446944 sec + 8,408,759,874 cycles # 3.068 GHz + 18,673,492,162 instructions # 2.22 insn per cycle + 2.843675081 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.986190 sec + 13,309,313,958 cycles # 3.084 GHz + 29,253,936,467 instructions # 2.20 insn per cycle + 4.370982628 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.978714e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.978965e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.978965e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.293741 sec - 18,509,726,309 cycles # 3.496 GHz - 53,668,966,356 instructions # 2.90 insn per cycle - 5.295890020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32178) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.219195 sec + 18,809,079,145 cycles # 3.025 GHz + 53,925,834,666 instructions # 2.87 insn per cycle + 6.232860023 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.900218e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900307e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900307e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.781026 sec - 9,726,233,315 cycles # 3.496 GHz - 27,082,128,443 instructions # 2.78 insn per cycle - 2.783220311 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.189478 sec + 9,805,870,159 cycles # 3.076 GHz + 27,091,831,447 instructions # 2.76 insn per cycle + 3.203897537 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.269104e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.269539e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.269539e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.238661 sec - 4,092,373,675 cycles # 3.300 GHz - 9,554,221,469 instructions # 2.33 insn per cycle - 1.240854013 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.464714 sec + 4,224,699,489 cycles # 2.882 GHz + 9,562,401,622 instructions # 2.26 insn per cycle + 1.476328883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901596e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.902173e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.902173e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079028 sec - 3,565,293,300 cycles # 3.300 GHz - 8,478,411,533 instructions # 2.38 insn per cycle - 1.081182403 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.294499 sec + 3,723,740,700 cycles # 2.874 GHz + 8,486,051,495 instructions # 2.28 insn per cycle + 1.308410916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.548194e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.549944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.549944e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.620448 sec - 2,042,307,043 cycles # 3.284 GHz - 4,267,534,527 instructions # 2.09 insn per cycle - 0.622543731 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.421818 sec + 2,699,411,216 cycles # 1.899 GHz + 4,277,531,970 instructions # 1.58 insn per cycle + 1.435104148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 4969158cd6..e89ab34326 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:16:21 +DATE: 2024-03-01_02:39:49 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769342e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.706494 sec + 5,724,877,835 cycles # 2.946 GHz + 11,350,286,337 instructions # 1.98 insn per cycle + 2.064496697 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.926202 sec + 6,794,636,243 cycles # 3.076 GHz + 13,931,883,029 instructions # 2.05 insn per cycle + 2.265774235 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.040188e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.040217e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.079204 sec - 17,758,838,927 cycles # 3.495 GHz - 53,421,541,162 instructions # 3.01 insn per cycle - 5.081388085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.899633 sec + 18,012,008,843 cycles # 3.055 GHz + 53,588,806,253 instructions # 2.98 insn per cycle + 5.906269981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087545108E-003 -Relative difference = 2.11977393295785e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.216869e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.217343e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.217343e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.254841 sec - 4,386,128,209 cycles # 3.492 GHz - 13,755,443,722 instructions # 3.14 insn per cycle - 1.257011070 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.492504 sec + 4,596,969,768 cycles # 3.077 GHz + 13,763,413,131 instructions # 2.99 insn per cycle + 1.508036951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.433176e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.435080e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.435080e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.628179 sec - 2,077,257,351 cycles # 3.300 GHz - 4,810,210,256 instructions # 2.32 insn per cycle - 0.630303522 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.749250 sec + 2,146,538,234 cycles # 2.864 GHz + 4,817,770,938 instructions # 2.24 insn per cycle + 0.763621351 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.775402e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.777862e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.777862e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.542106 sec - 1,793,648,403 cycles # 3.300 GHz - 4,267,307,526 instructions # 2.38 insn per cycle - 0.544249464 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.652928 sec + 1,865,233,671 cycles # 2.849 GHz + 4,274,819,205 instructions # 2.29 insn per cycle + 0.666710238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.719714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.720497e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.720497e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.309831 sec - 1,024,905,660 cycles # 3.295 GHz - 2,150,921,588 instructions # 2.10 insn per cycle - 0.312019958 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.715424 sec + 1,360,172,621 cycles # 1.900 GHz + 2,159,744,323 instructions # 1.59 insn per cycle + 0.729957103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 1862b6e1e8..684ca24c1f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:31:49 +DATE: 2024-03-01_03:18:37 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.598425 sec + 5,724,594,753 cycles # 3.063 GHz + 12,186,790,592 instructions # 2.13 insn per cycle + 1.928350107 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.887489 sec + 6,620,617,732 cycles # 3.045 GHz + 14,303,245,528 instructions # 2.16 insn per cycle + 2.231962749 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.041119e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041148e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.041148e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.074721 sec - 17,746,301,498 cycles # 3.497 GHz - 53,422,248,318 instructions # 3.01 insn per cycle - 5.076826933 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.812831 sec + 17,931,583,834 cycles # 3.083 GHz + 53,588,775,363 instructions # 2.99 insn per cycle + 5.816760256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087545108E-003 -Relative difference = 2.11977393295785e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.218426e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.218902e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.218902e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.254452 sec - 4,388,322,961 cycles # 3.494 GHz - 13,756,441,763 instructions # 3.13 insn per cycle - 1.256579192 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.483014 sec + 4,585,157,051 cycles # 3.085 GHz + 13,762,636,955 instructions # 3.00 insn per cycle + 1.487033664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.453965e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.455916e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.455916e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.626630 sec - 2,072,617,839 cycles # 3.300 GHz - 4,811,160,652 instructions # 2.32 insn per cycle - 0.628769204 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.734407 sec + 2,124,324,714 cycles # 2.880 GHz + 4,817,114,861 instructions # 2.27 insn per cycle + 0.738469635 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.777854e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.780436e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.780436e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.541994 sec - 1,793,603,060 cycles # 3.300 GHz - 4,268,281,709 instructions # 2.38 insn per cycle - 0.544156652 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.686036 sec + 1,868,608,359 cycles # 2.710 GHz + 4,274,464,507 instructions # 2.29 insn per cycle + 0.690085324 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.717939e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.718720e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.310080 sec - 1,025,575,344 cycles # 3.292 GHz - 2,151,895,701 instructions # 2.10 insn per cycle - 0.312200294 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.701778 sec + 1,356,865,477 cycles # 1.924 GHz + 2,159,196,207 instructions # 1.59 insn per cycle + 0.705773287 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 0d3fb1c86c..2af18ad9d5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:16:40 +DATE: 2024-03-01_02:40:36 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.766757e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.693781 sec + 5,858,518,501 cycles # 3.029 GHz + 12,487,165,720 instructions # 2.13 insn per cycle + 2.044833380 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.933893 sec + 6,737,061,424 cycles # 3.047 GHz + 14,801,104,127 instructions # 2.20 insn per cycle + 2.267780802 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.032406e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032434e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.032434e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.116727 sec - 17,886,024,738 cycles # 3.495 GHz - 53,426,547,569 instructions # 2.99 insn per cycle - 5.118914313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20415) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.925615 sec + 17,989,215,363 cycles # 3.036 GHz + 53,579,777,630 instructions # 2.98 insn per cycle + 5.931642569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087257751E-003 -Relative difference = 2.119482139617284e-08 +Avg ME (F77/C++) = 9.8479612087582491E-003 +Relative difference = 2.1198118933954545e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.256894e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.257370e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.257370e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.243013 sec - 4,349,759,961 cycles # 3.495 GHz - 13,748,864,713 instructions # 3.16 insn per cycle - 1.245159087 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.490731 sec + 4,558,556,123 cycles # 3.055 GHz + 13,757,084,226 instructions # 3.02 insn per cycle + 1.501811120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896225560E-003 Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.499112e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.501031e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.501031e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.623063 sec - 2,060,898,442 cycles # 3.300 GHz - 4,812,404,156 instructions # 2.34 insn per cycle - 0.625176375 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.743943 sec + 2,139,817,263 cycles # 2.875 GHz + 4,819,936,629 instructions # 2.25 insn per cycle + 0.755587883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.724133e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.726548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.726548e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.544853 sec - 1,802,924,271 cycles # 3.300 GHz - 4,269,301,578 instructions # 2.37 insn per cycle - 0.547035634 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.647666 sec + 1,869,906,105 cycles # 2.875 GHz + 4,276,791,956 instructions # 2.29 insn per cycle + 0.664053491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.697678e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698411e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.313529 sec - 1,032,832,833 cycles # 3.280 GHz - 2,157,258,957 instructions # 2.09 insn per cycle - 0.315697429 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.718650 sec + 1,366,457,842 cycles # 1.901 GHz + 2,166,062,692 instructions # 1.59 insn per cycle + 0.731356674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982955140E-003 Relative difference = 2.0044060904369713e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 8c752511ca..c639834643 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:17:00 +DATE: 2024-03-01_02:41:23 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.691928e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.198692 sec + 7,604,134,018 cycles # 3.054 GHz + 16,321,512,266 instructions # 2.15 insn per cycle + 2.594812497 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.397194 sec + 11,475,121,938 cycles # 3.084 GHz + 26,000,925,285 instructions # 2.27 insn per cycle + 3.777191130 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.888048e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.888295e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.888295e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.342426 sec - 18,680,140,983 cycles # 3.496 GHz - 53,894,914,129 instructions # 2.89 insn per cycle - 5.344498912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32196) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.578920 sec + 19,096,747,933 cycles # 2.903 GHz + 54,154,360,803 instructions # 2.84 insn per cycle + 6.585797711 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.973916e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.974013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.974013e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.677279 sec - 9,359,247,427 cycles # 3.494 GHz - 26,144,863,285 instructions # 2.79 insn per cycle - 2.679446649 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.239396 sec + 9,369,032,238 cycles # 2.892 GHz + 26,160,172,444 instructions # 2.79 insn per cycle + 3.251135271 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.494886e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.495366e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.495366e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.176795 sec - 3,888,164,225 cycles # 3.300 GHz - 9,214,766,456 instructions # 2.37 insn per cycle - 1.178930781 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.438333 sec + 4,079,178,507 cycles # 2.840 GHz + 9,228,646,226 instructions # 2.26 insn per cycle + 1.450605350 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.132578e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.133212e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.133212e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.030734 sec - 3,406,124,705 cycles # 3.300 GHz - 8,162,197,379 instructions # 2.40 insn per cycle - 1.032894212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.218747 sec + 3,509,445,956 cycles # 2.879 GHz + 8,176,263,750 instructions # 2.33 insn per cycle + 1.230057623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.830953e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.832825e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.832825e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.600646 sec - 1,980,221,274 cycles # 3.289 GHz - 4,145,631,132 instructions # 2.09 insn per cycle - 0.602788892 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381042 sec + 2,620,845,167 cycles # 1.898 GHz + 4,155,618,865 instructions # 1.59 insn per cycle + 1.395419124 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 04dd6516f8..ace04f97d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:17:24 +DATE: 2024-03-01_02:42:25 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.692361e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.171682 sec + 7,616,890,265 cycles # 3.058 GHz + 16,356,089,453 instructions # 2.15 insn per cycle + 2.553555988 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.406322 sec + 11,260,210,288 cycles # 3.017 GHz + 25,906,087,343 instructions # 2.30 insn per cycle + 3.788413520 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.833294e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.833543e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.833543e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.373246 sec - 18,782,693,778 cycles # 3.495 GHz - 53,895,950,177 instructions # 2.87 insn per cycle - 5.375414541 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32348) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.644473 sec + 19,262,229,911 cycles # 2.898 GHz + 54,152,472,780 instructions # 2.81 insn per cycle + 6.648593616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.984004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.984101e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.984101e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.663925 sec - 9,315,048,345 cycles # 3.495 GHz - 26,065,146,110 instructions # 2.80 insn per cycle - 2.666137308 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.257928 sec + 9,349,757,536 cycles # 2.867 GHz + 26,077,919,393 instructions # 2.79 insn per cycle + 3.270643449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.435411e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.435873e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.435873e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.192345 sec - 3,939,476,607 cycles # 3.300 GHz - 9,201,069,306 instructions # 2.34 insn per cycle - 1.194498730 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.408906 sec + 4,059,558,991 cycles # 2.874 GHz + 9,213,876,384 instructions # 2.27 insn per cycle + 1.420092908 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.088546e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089160e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.089160e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.039445 sec - 3,434,917,766 cycles # 3.300 GHz - 8,155,176,688 instructions # 2.37 insn per cycle - 1.041616665 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.231479 sec + 3,558,951,872 cycles # 2.881 GHz + 8,168,148,330 instructions # 2.30 insn per cycle + 1.241837128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.811569e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.813385e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.813385e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.601983 sec - 1,984,653,054 cycles # 3.289 GHz - 4,144,401,294 instructions # 2.09 insn per cycle - 0.604132820 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381601 sec + 2,619,896,392 cycles # 1.892 GHz + 4,153,497,129 instructions # 1.59 insn per cycle + 1.390536918 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 2cec001807..4f705cbffa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:14:48 +DATE: 2024-03-01_02:35:57 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.446213 sec + 1,972,017,701 cycles # 2.992 GHz + 2,778,256,208 instructions # 1.41 insn per cycle + 0.734930275 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.528224 sec + 2,304,762,750 cycles # 3.008 GHz + 3,294,040,641 instructions # 1.43 insn per cycle + 0.823439197 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.390238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419308e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419308e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.204060 sec - 4,213,611,842 cycles # 3.495 GHz - 13,434,873,181 instructions # 3.19 insn per cycle - 1.206434287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.522856 sec + 4,703,604,569 cycles # 3.081 GHz + 13,462,460,024 instructions # 2.86 insn per cycle + 1.529442917 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.533543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.629343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.629343e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.671073 sec - 2,349,549,529 cycles # 3.493 GHz - 7,542,124,614 instructions # 3.21 insn per cycle - 0.673394223 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.861454 sec + 2,622,516,081 cycles # 3.029 GHz + 7,553,226,055 instructions # 2.88 insn per cycle + 0.875162721 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.512664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.794644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.794644e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.387185 sec - 1,289,012,082 cycles # 3.315 GHz - 3,105,638,031 instructions # 2.41 insn per cycle - 0.389505293 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506903 sec + 1,479,878,074 cycles # 2.896 GHz + 3,120,545,502 instructions # 2.11 insn per cycle + 0.521612120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.739873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.054260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.054260e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.370616 sec - 1,234,624,013 cycles # 3.316 GHz - 2,961,729,590 instructions # 2.40 insn per cycle - 0.372993568 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.456990 sec + 1,342,026,946 cycles # 2.909 GHz + 2,982,806,139 instructions # 2.22 insn per cycle + 0.473253864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.362372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.769155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.769155e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.330642 sec - 1,102,117,372 cycles # 3.316 GHz - 1,932,728,355 instructions # 1.75 insn per cycle - 0.333111717 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.665523 sec + 1,326,336,546 cycles # 1.981 GHz + 1,954,248,677 instructions # 1.47 insn per cycle + 0.676015017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index bb3a843a97..7838899130 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:30:37 +DATE: 2024-03-01_03:15:54 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.471075 sec + 2,051,009,542 cycles # 3.009 GHz + 3,055,349,974 instructions # 1.49 insn per cycle + 0.738770181 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.748132 sec + 3,046,262,026 cycles # 3.023 GHz + 4,636,082,832 instructions # 1.52 insn per cycle + 1.065675268 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.391136e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.420435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.420435e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.207326 sec - 4,222,941,203 cycles # 3.492 GHz - 13,442,631,396 instructions # 3.18 insn per cycle - 1.209898020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.529900 sec + 4,728,814,715 cycles # 3.083 GHz + 13,467,526,764 instructions # 2.85 insn per cycle + 1.534252544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.522907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.617976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.617976e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.678477 sec - 2,374,544,352 cycles # 3.491 GHz - 7,592,390,633 instructions # 3.20 insn per cycle - 0.680996171 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.869004 sec + 2,652,875,861 cycles # 3.039 GHz + 7,602,145,003 instructions # 2.87 insn per cycle + 0.873736497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.487640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.768436e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.768436e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.394135 sec - 1,312,471,214 cycles # 3.316 GHz - 3,157,137,064 instructions # 2.41 insn per cycle - 0.396629875 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.550316 sec + 1,514,222,662 cycles # 2.732 GHz + 3,170,467,422 instructions # 2.09 insn per cycle + 0.554802806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.703184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.017690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.017690e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.378206 sec - 1,260,283,982 cycles # 3.317 GHz - 3,011,528,674 instructions # 2.39 insn per cycle - 0.380880323 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.478096 sec + 1,374,122,120 cycles # 2.850 GHz + 3,032,631,270 instructions # 2.21 insn per cycle + 0.482825268 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.332880e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.727962e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.727962e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.336607 sec - 1,122,764,946 cycles # 3.319 GHz - 1,968,890,958 instructions # 1.75 insn per cycle - 0.338989281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.675099 sec + 1,354,490,621 cycles # 1.996 GHz + 1,991,409,834 instructions # 1.47 insn per cycle + 0.679620955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 72f1443440..1de3a7df55 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:14:56 +DATE: 2024-03-01_02:36:15 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.443315 sec + 2,012,981,464 cycles # 3.013 GHz + 2,802,025,362 instructions # 1.39 insn per cycle + 0.744859677 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.526694 sec + 2,300,725,267 cycles # 3.007 GHz + 3,244,738,845 instructions # 1.41 insn per cycle + 0.822736768 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.382004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410796e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.210873 sec - 4,235,605,729 cycles # 3.494 GHz - 13,439,420,817 instructions # 3.17 insn per cycle - 1.213116092 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 853) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.520645 sec + 4,710,102,553 cycles # 3.090 GHz + 13,456,334,828 instructions # 2.86 insn per cycle + 1.527404362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.515723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.608916e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.608916e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.675572 sec - 2,364,775,142 cycles # 3.493 GHz - 7,541,520,259 instructions # 3.19 insn per cycle - 0.677865882 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.841713 sec + 2,618,818,041 cycles # 3.096 GHz + 7,552,217,415 instructions # 2.88 insn per cycle + 0.854217946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.519149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.803981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.803981e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.386496 sec - 1,286,321,868 cycles # 3.315 GHz - 3,104,502,696 instructions # 2.41 insn per cycle - 0.388829317 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506766 sec + 1,482,977,233 cycles # 2.909 GHz + 3,119,381,568 instructions # 2.10 insn per cycle + 0.519705447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.751909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.067933e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.067933e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.369266 sec - 1,230,024,804 cycles # 3.316 GHz - 2,957,574,250 instructions # 2.40 insn per cycle - 0.371630014 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457488 sec + 1,337,095,985 cycles # 2.896 GHz + 2,979,946,273 instructions # 2.23 insn per cycle + 0.473330982 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.342815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.741091e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.741091e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.331348 sec - 1,101,123,467 cycles # 3.308 GHz - 1,929,122,407 instructions # 1.75 insn per cycle - 0.333682550 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.666550 sec + 1,326,556,264 cycles # 1.978 GHz + 1,952,513,162 instructions # 1.47 insn per cycle + 0.681133765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 1c0f8553bf..4d40239a82 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:03 +DATE: 2024-03-01_02:36:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.439896 sec + 1,919,384,660 cycles # 2.928 GHz + 2,652,462,812 instructions # 1.38 insn per cycle + 0.728915663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476459 sec + 2,111,535,021 cycles # 3.010 GHz + 2,984,192,787 instructions # 1.41 insn per cycle + 0.759063881 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.488352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.522956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.522956e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.124842 sec - 3,936,915,161 cycles # 3.495 GHz - 13,032,328,324 instructions # 3.31 insn per cycle - 1.127095177 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.434431 sec + 4,452,862,887 cycles # 3.097 GHz + 13,047,773,125 instructions # 2.93 insn per cycle + 1.440725517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.897446e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147364e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.147364e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.443424 sec - 1,554,112,621 cycles # 3.493 GHz - 4,506,739,300 instructions # 2.90 insn per cycle - 0.445630602 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547840 sec + 1,698,684,785 cycles # 3.077 GHz + 4,513,142,797 instructions # 2.66 insn per cycle + 0.560862800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.891149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.978192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.978192e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.230449 sec - 770,038,251 cycles # 3.320 GHz - 1,884,223,687 instructions # 2.45 insn per cycle - 0.232708341 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.289099 sec + 853,788,001 cycles # 2.912 GHz + 1,897,231,072 instructions # 2.22 insn per cycle + 0.300313484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.376104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.503945e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.503945e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.218942 sec - 731,762,064 cycles # 3.320 GHz - 1,799,410,295 instructions # 2.46 insn per cycle - 0.221195439 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.271830 sec + 801,479,133 cycles # 2.904 GHz + 1,820,357,988 instructions # 2.27 insn per cycle + 0.285846070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.006759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.177946e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.177946e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.186117 sec - 620,244,286 cycles # 3.306 GHz - 1,284,140,386 instructions # 2.07 insn per cycle - 0.188364686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.349567 sec + 731,841,700 cycles # 2.069 GHz + 1,305,336,291 instructions # 1.78 insn per cycle + 0.359850888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index b407e68a76..441da29ffb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:30:44 +DATE: 2024-03-01_03:16:12 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.448633 sec + 2,014,530,108 cycles # 3.024 GHz + 2,953,646,670 instructions # 1.47 insn per cycle + 0.724573840 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.616658 sec + 2,563,348,424 cycles # 3.027 GHz + 3,871,269,369 instructions # 1.51 insn per cycle + 0.904047137 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.484109e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.518859e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.518859e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.130053 sec - 3,951,393,993 cycles # 3.492 GHz - 13,036,916,705 instructions # 3.30 insn per cycle - 1.132358928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.433803 sec + 4,469,694,345 cycles # 3.110 GHz + 13,052,094,019 instructions # 2.92 insn per cycle + 1.437926738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.884653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.132509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.132509e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.447778 sec - 1,569,690,605 cycles # 3.494 GHz - 4,555,017,000 instructions # 2.90 insn per cycle - 0.450120274 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.554057 sec + 1,716,801,013 cycles # 3.079 GHz + 4,560,314,564 instructions # 2.66 insn per cycle + 0.558193661 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.847528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.908555e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.908555e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.234405 sec - 783,805,869 cycles # 3.322 GHz - 1,921,295,110 instructions # 2.45 insn per cycle - 0.236707878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.297621 sec + 872,015,724 cycles # 2.894 GHz + 1,933,356,220 instructions # 2.22 insn per cycle + 0.301984624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.304436e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.414872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.414872e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.223426 sec - 747,350,874 cycles # 3.322 GHz - 1,836,566,153 instructions # 2.46 insn per cycle - 0.225745389 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.276934 sec + 818,470,682 cycles # 2.917 GHz + 1,856,220,484 instructions # 2.27 insn per cycle + 0.281151541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.000891e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167702e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167702e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.190037 sec - 635,927,755 cycles # 3.319 GHz - 1,325,913,533 instructions # 2.09 insn per cycle - 0.192251442 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.358667 sec + 751,185,964 cycles # 2.073 GHz + 1,346,032,296 instructions # 1.79 insn per cycle + 0.362975431 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 5b9052bbb4..8918bec5c8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:10 +DATE: 2024-03-01_02:36:50 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.436130 sec + 1,959,442,257 cycles # 3.009 GHz + 2,743,667,126 instructions # 1.40 insn per cycle + 0.720037686 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476114 sec + 2,116,952,174 cycles # 3.025 GHz + 3,000,364,507 instructions # 1.42 insn per cycle + 0.758577490 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.490551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.525624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.525624e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.123253 sec - 3,928,398,999 cycles # 3.493 GHz - 13,013,265,295 instructions # 3.31 insn per cycle - 1.125553354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.438010 sec + 4,446,707,539 cycles # 3.084 GHz + 13,028,651,848 instructions # 2.93 insn per cycle + 1.444314220 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.877785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124762e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.445530 sec - 1,559,423,077 cycles # 3.489 GHz - 4,502,604,517 instructions # 2.89 insn per cycle - 0.447755001 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547784 sec + 1,696,823,876 cycles # 3.074 GHz + 4,509,092,353 instructions # 2.66 insn per cycle + 0.559046282 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.903710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.988700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.988700e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.230083 sec - 768,596,123 cycles # 3.320 GHz - 1,881,278,062 instructions # 2.45 insn per cycle - 0.232327253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.292180 sec + 859,590,330 cycles # 2.901 GHz + 1,893,994,453 instructions # 2.20 insn per cycle + 0.304986924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.360577e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.493697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.493697e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.219203 sec - 732,515,894 cycles # 3.320 GHz - 1,795,625,953 instructions # 2.45 insn per cycle - 0.221411578 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.269638 sec + 798,515,936 cycles # 2.915 GHz + 1,816,168,831 instructions # 2.27 insn per cycle + 0.281600896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.010951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181673e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.185361 sec - 618,797,353 cycles # 3.311 GHz - 1,281,956,446 instructions # 2.07 insn per cycle - 0.187631017 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.355005 sec + 734,840,966 cycles # 2.046 GHz + 1,303,017,912 instructions # 1.77 insn per cycle + 0.365594980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 0e5cce6fad..9473075c44 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:17 +DATE: 2024-03-01_02:37:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444064 sec + 2,011,501,510 cycles # 2.996 GHz + 2,813,725,950 instructions # 1.40 insn per cycle + 0.745188123 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.531362 sec + 2,289,898,203 cycles # 2.976 GHz + 3,193,334,828 instructions # 1.39 insn per cycle + 0.827090728 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.380422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.409222e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.409222e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.212271 sec - 4,238,626,091 cycles # 3.492 GHz - 13,408,625,650 instructions # 3.16 insn per cycle - 1.214366196 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 836) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.528426 sec + 4,733,772,591 cycles # 3.090 GHz + 13,465,129,433 instructions # 2.84 insn per cycle + 1.534888113 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.553855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.651544e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.666771 sec - 2,329,497,090 cycles # 3.485 GHz - 7,376,535,398 instructions # 3.17 insn per cycle - 0.669166331 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.843067 sec + 2,603,799,246 cycles # 3.073 GHz + 7,385,481,301 instructions # 2.84 insn per cycle + 0.853727039 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.590474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.884942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.884942e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.381079 sec - 1,268,726,163 cycles # 3.316 GHz - 3,041,731,155 instructions # 2.40 insn per cycle - 0.383435781 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.502006 sec + 1,465,753,503 cycles # 2.896 GHz + 3,056,435,528 instructions # 2.09 insn per cycle + 0.511483566 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.865460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199799e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.361363 sec - 1,203,719,519 cycles # 3.317 GHz - 2,908,941,396 instructions # 2.42 insn per cycle - 0.363683085 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444397 sec + 1,302,869,174 cycles # 2.905 GHz + 2,931,108,724 instructions # 2.25 insn per cycle + 0.456529729 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.111885e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.475216e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.475216e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.345104 sec - 1,149,300,201 cycles # 3.315 GHz - 1,946,685,540 instructions # 1.69 insn per cycle - 0.347411360 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.681918 sec + 1,362,782,748 cycles # 1.986 GHz + 1,970,355,079 instructions # 1.45 insn per cycle + 0.693685126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index b85d55e45a..f04f8628ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:25 +DATE: 2024-03-01_02:37:24 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.445224 sec + 1,992,469,002 cycles # 2.992 GHz + 2,813,148,728 instructions # 1.41 insn per cycle + 0.736789901 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... 
+runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.532147 sec + 2,297,521,664 cycles # 2.990 GHz + 3,210,517,070 instructions # 1.40 insn per cycle + 0.827894226 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.377531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.405985e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.405985e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.214658 sec - 4,246,718,650 cycles # 3.492 GHz - 13,407,960,809 instructions # 3.16 insn per cycle - 1.216936825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 826) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.523445 sec + 4,724,741,346 cycles # 3.094 GHz + 13,451,257,746 instructions # 2.85 insn per cycle + 1.529633779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.553314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.650058e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.665859 sec - 2,332,786,191 cycles # 3.495 GHz - 7,378,270,783 instructions # 3.16 insn per cycle - 0.668166884 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.835617 sec + 2,595,186,002 cycles # 3.089 GHz + 7,389,201,553 instructions # 2.85 insn per cycle + 0.854907608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.588001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.879616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.879616e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.381270 sec - 1,269,081,565 cycles # 3.315 GHz - 3,041,458,855 instructions # 2.40 insn per cycle - 0.383572552 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.503119 sec + 1,466,604,979 cycles # 2.890 GHz + 3,056,260,975 instructions # 2.08 insn per cycle + 0.515296062 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.863319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.195493e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.195493e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.361431 sec - 1,203,784,149 cycles # 3.317 GHz - 2,909,476,469 instructions # 2.42 insn per cycle - 0.363729643 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457389 sec + 1,310,592,019 cycles # 2.838 GHz + 2,931,897,706 instructions # 2.24 insn per cycle + 0.469608344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.102649e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.463171e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.345690 sec - 1,150,403,593 cycles # 3.313 GHz - 1,946,604,480 instructions # 1.69 insn per cycle - 0.347985872 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.689340 sec + 1,364,202,689 cycles # 1.967 GHz + 1,970,285,028 instructions # 1.44 insn per cycle + 0.699058633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 From 0fc6567bb1534f88c9d39672b28942a279f1f78f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 19:04:56 +0100 Subject: [PATCH 93/96] [susy2] in CODEGEN, add reference test file for susy_gg_tttt ./CODEGEN/generateAndCompare.sh susy_gg_tttt -c 'import model MSSM_SLHA2; generate g g > t t~ t t~' CUDACPP_RUNTEST_DUMPEVENTS=1 ./runTest.exe cp ../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt ../../../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/ --- ...ump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt | 4608 +++++++++++++++++ 1 file changed, 4608 insertions(+) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt new file mode 100644 index 0000000000..1ab834d057 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_ttxttx.txt @@ -0,0 +1,4608 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.045233209356228e+02 6.877986897204741e+01 -1.905381248013139e+02 2.818406336784427e+01 + 3 5.474933604313479e+02 -4.596225360107567e+02 3.030720946352406e+01 2.959350894402092e+02 + 4 5.014688717565998e+02 4.188441856206845e+02 2.572754903817052e+02 -9.924666020293013e+01 + 5 2.465144468764298e+02 -2.800151858197540e+01 -9.704457504391526e+01 -2.248724926051235e+02 + ME 4.847626218542335e-08 + +Event 1 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.008083996490003e+02 7.883320272753282e+01 -1.779144499629305e+02 -4.955590793941927e+01 + 3 5.095915127572658e+02 1.945100505575214e+01 4.232835353003264e+02 2.830834100052794e+02 + 4 5.330731110420373e+02 -3.351807593152421e+02 -3.437684940301842e+02 -2.316117933377167e+02 + 5 2.565269765516967e+02 2.368965515319572e+02 9.839940869278837e+01 -1.915708728143429e+00 + ME 1.196535026704345e-08 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.874993553150720e+02 -3.353676580361334e+02 -1.808823320939806e+02 -7.046890271033260e+01 + 3 3.626455728935600e+02 2.056734411127569e+02 -1.625327830620692e+02 2.505859963969135e+02 + 4 5.314718876693759e+02 3.681440252065140e+01 3.745048277115114e+02 -3.753041448887399e+02 + 5 2.183831841219926e+02 9.287981440272507e+01 -3.108971255546151e+01 1.951870512021589e+02 + ME 8.937974432546909e-09 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.366452034215279e+02 1.646792644528339e+02 4.079517915654560e+02 -4.601882122104464e+02 + 
3 2.850611318540120e+02 -1.410564077018440e+02 2.241470575148822e+01 2.466992495047679e+02 + 4 5.100787565257925e+02 3.619333890962012e+01 -4.480337270250971e+02 2.411144116917793e+02 + 5 6.821490819866779e+01 -5.981619566060999e+01 1.766722970815293e+01 -2.762544898610076e+01 + ME 8.024921469528194e-07 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.502637187052808e+02 5.534999170445882e+01 -2.382039909598248e+02 -5.317110126115480e+01 + 3 6.306127670141705e+02 9.222569406819551e+01 6.223439713329586e+02 -4.306813927289466e+01 + 4 3.012500630459575e+02 -1.662164166936456e+01 -2.533227164825486e+02 -1.621817586145584e+02 + 5 3.178734512345914e+02 -1.309540441032896e+02 -1.308172638905854e+02 2.584209991486080e+02 + ME 1.364278166669640e-08 + +Event 5 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.543595875307081e+02 1.177423500857260e+02 2.021351892259652e+02 -9.988445345330558e+01 + 3 1.672353472875774e+02 -4.643301969933536e+01 1.390571194439707e+02 -8.046585360227668e+01 + 4 4.148863440715186e+02 5.705637783937731e+01 -5.963382698230391e+01 -4.065944600266038e+02 + 5 6.635187211101953e+02 -1.283657082257679e+02 -2.815584816876320e+02 5.869447670821860e+02 + ME 2.814386504500931e-06 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.600736528022405e+02 3.737322990662495e+02 2.536668813098300e+02 -8.743596493432509e+01 + 3 2.003131692770116e+02 1.425583968826074e+02 -7.419927099902715e+01 -1.195698015691029e+02 + 4 6.919589874187654e+02 -6.251541860981462e+02 -2.582872915825600e+02 
1.458669218175010e+02 + 5 1.476541905019823e+02 1.088634901492892e+02 7.881968127175715e+01 6.113884468592695e+01 + ME 2.020866119480731e-07 + +Event 7 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.145629825968218e+02 1.959037730784687e+02 -1.819514480286373e+02 -3.168246341329453e+02 + 3 2.635019328107887e+02 -5.865118337121915e+01 -1.151155589842044e+02 -2.296556451819357e+02 + 4 5.280293449698501e+02 -2.518410835847975e+02 3.981394172424513e+01 4.623915092348714e+02 + 5 2.939057396225404e+02 1.145884938775480e+02 2.572530652885965e+02 8.408877008000950e+01 + ME 9.495188055645316e-07 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.397378022805042e+02 -1.487288937969703e+01 -1.160009939420407e+02 6.289570909312076e+02 + 3 1.834286250976930e+02 1.635491119298312e+02 -6.096802297516525e+01 -5.639723988541733e+01 + 4 4.991547965706246e+02 -8.664012365698301e+01 3.208151079475728e+01 -4.905300975243927e+02 + 5 1.776787760511781e+02 -6.203609889315123e+01 1.448875061224487e+02 -8.202975352139767e+01 + ME 5.894678342728904e-06 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.768693845261533e+02 -4.456674718007654e+02 1.636141522680621e+02 -4.489235630560712e+01 + 3 2.612786207802824e+02 8.184621745013867e+01 2.479930738231250e+02 8.194492156890167e+00 + 4 3.368713959814186e+02 3.479928867591731e+01 -2.732232778405869e+02 -1.939597571254547e+02 + 5 4.249805987121456e+02 3.290219656747095e+02 -1.383839482506003e+02 2.306576212741717e+02 + ME 8.058064791325644e-09 + +Event 10 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.488350171096896e+02 -1.831887653712691e+02 -5.102545779713487e+02 -8.545184554369904e+01 + 3 7.035065238815303e+02 4.041792774985642e+01 6.809284192215310e+02 1.721171349887872e+02 + 4 1.751828801954048e+02 1.268400499588020e+02 -1.091761136932150e+02 -5.178049284095648e+01 + 5 7.247557881337549e+01 1.593078766261062e+01 -6.149772755696730e+01 -3.488479660413173e+01 + ME 7.185595232667391e-06 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.269658880944945e+02 -2.185431515300267e+02 5.206117424428266e+01 3.228063528112240e+01 + 3 1.005856529182378e+02 -4.644479827572636e+01 -7.508984239583351e+01 4.818578478227943e+01 + 4 4.675628274246624e+02 -2.703728833735589e+02 -2.515742362338792e+02 2.867471101666976e+02 + 5 7.048856315626052e+02 5.353608331793120e+02 2.746029043854301e+02 -3.672135302300991e+02 + ME 1.835840029457470e-05 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.837700766590144e+02 1.296958544175531e+02 -7.540996840916677e+01 2.408687991801600e+02 + 3 2.628002397813192e+02 -8.376084481590415e+01 -1.663098735764636e+02 -1.854430178136279e+02 + 4 5.190623657100423e+02 -3.621783241410700e+02 1.519752457270120e+02 -3.393466158739262e+02 + 5 4.343673178496246e+02 3.162433145394211e+02 8.974459625861849e+01 2.839208345073941e+02 + ME 1.073098430327465e-06 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 
+ 2 5.995598257201646e+02 -5.586644429775191e+02 1.101534889159592e+02 -1.877025137014794e+02 + 3 5.937134122752478e+02 5.346076236572911e+02 -1.031605262004246e+02 2.367450326344228e+02 + 4 1.647592826022760e+02 -7.925301245804906e+01 1.680176709576466e+01 -1.434652635392995e+02 + 5 1.419674794023117e+02 1.033098317782771e+02 -2.379472981129929e+01 9.442274460635609e+01 + ME 2.709032049584457e-08 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.028779021236547e+02 1.164750571227171e+02 -2.225368001532150e+02 1.692511661426709e+02 + 3 5.030384766013232e+02 -4.515737850621548e+02 -1.020261550868048e+02 -1.967726842430677e+02 + 4 3.063971056395571e+02 2.430825837173774e+02 1.431329210018300e+01 -1.859708942464841e+02 + 5 3.876865156354653e+02 9.201614422206015e+01 3.102496631398368e+02 2.134924123468810e+02 + ME 3.142319252685384e-08 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.241250378461693e+02 -1.805033431240864e+02 1.231079432807119e+02 -4.995007519281845e+01 + 3 3.844614260046683e+02 1.307728483313577e+02 -9.608556196200369e+01 -3.485349552158186e+02 + 4 4.110760119555223e+02 5.739270654525845e+01 -3.965776976049418e+02 9.173709502175156e+01 + 5 4.803375241936400e+02 -7.662211752529799e+00 3.695553162862337e+02 3.067479353868856e+02 + ME 1.096121714288426e-07 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.001054877319223e+02 -1.430620362533657e+01 7.855291304039666e+01 -6.038113169479623e+01 + 3 4.093088791607984e+02 -1.119992820429624e+02 3.757454087335235e+02 
-1.174959880055799e+02 + 4 7.087543950175984e+02 1.089443267851879e+02 -6.992154166158715e+02 3.951869538303891e+01 + 5 2.818312380896809e+02 1.736115888311106e+01 2.449170948419514e+02 1.383584243173372e+02 + ME 3.792567001414741e-06 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.581631210613617e+02 -1.069713793286924e+02 -1.856462517995978e+02 -1.440166319333314e+02 + 3 4.806574857692273e+02 -3.871036291743681e+02 -8.927922635205843e+01 2.705764561967765e+02 + 4 4.638994968983048e+02 4.586966534295721e+02 -1.433077824478441e+01 -6.778460112673157e+01 + 5 2.972798962711065e+02 3.537835507348829e+01 2.892562563964408e+02 -5.877522313671356e+01 + ME 1.213163152961851e-08 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.259069939880740e+02 6.694234146427497e+01 1.047108042274210e+02 -2.017279188511186e+01 + 3 4.274507013081183e+02 1.554527732937734e+02 -2.433805370458512e+02 -3.151419545400928e+02 + 4 3.505784003274900e+02 -2.628862848704467e+02 -8.012897055127391e+01 -2.176588249568330e+02 + 5 5.960639043763180e+02 4.049117011239836e+01 2.187987033697040e+02 5.529735713820376e+02 + ME 3.740013786737060e-07 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.489050407799418e+02 -2.155159091310213e+02 -1.094654413984134e+02 -5.936269347447438e+01 + 3 2.600997784564500e+02 1.488424657388715e+02 1.507021889641061e+02 -1.509525269324567e+02 + 4 5.901199044754460e+02 3.182370400099771e+02 -2.814956777202155e+02 4.095447123924745e+02 + 5 4.008752762881623e+02 -2.515635966178273e+02 
2.402589301545227e+02 -1.992294919855434e+02 + ME 2.237048841102538e-08 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.908334513331024e+02 8.808928417538735e+01 2.991337054415237e+02 -5.018400855815297e+02 + 3 1.401706268696378e+02 -9.583159895522556e+01 -7.965314127488557e+01 6.418322494044027e+01 + 4 2.982536537656620e+02 1.120939357246564e+02 1.793074392675402e+02 2.103307723126631e+02 + 5 4.707422680315980e+02 -1.043516209448182e+02 -3.987880034341784e+02 2.273260883284264e+02 + ME 1.566047284234493e-07 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.112413366700986e+02 2.716995006725958e+02 6.188924638009127e+01 3.024376621768124e+02 + 3 5.662415611998298e+02 -3.715892650272579e+02 2.050688206350350e+02 -3.748302316381091e+02 + 4 2.225653804612989e+02 -8.369496880497043e+01 -6.444146815856186e+01 1.959025215717929e+02 + 5 2.999517216687734e+02 1.835847331596327e+02 -2.025165988565645e+02 -1.235099521104963e+02 + ME 1.081315436624381e-08 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.464950628793003e+02 1.637062818694534e+02 -3.617598874126116e+02 2.041760963268814e+02 + 3 4.190330476761974e+02 3.863780321676089e+02 1.621746205431057e+02 3.221099009750205e-01 + 4 6.041993534847934e+02 -5.473292470825340e+02 1.815948543536371e+02 -1.803076895525812e+02 + 5 3.027253595970839e+01 -2.755066954528332e+00 1.799041251586903e+01 -2.419051667527518e+01 + ME 9.438472547067266e-07 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 
+ 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.340268758838264e+02 -9.885820280056214e+00 -1.281457206036256e+02 3.800195136498373e+01 + 3 1.460676733919085e+02 -1.207605538086376e+02 -7.761120113030877e+01 2.700287601473496e+01 + 4 5.791378930647284e+02 -2.550511254140081e+02 -3.011940406439271e+01 -5.190784566010291e+02 + 5 6.407675576595367e+02 3.856974995027019e+02 2.358763257983270e+02 4.540736292213105e+02 + ME 8.181588141380598e-07 + +Event 24 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.120994118474108e+02 -3.999484487477040e+02 -1.288251570508290e+02 4.450966268565518e+02 + 3 5.928222484883798e+02 4.242577382365044e+02 3.619492402961912e+01 -4.124724443205316e+02 + 4 1.136153476517903e+02 1.506354321291774e+01 -7.760365615106859e+01 -8.160397931410230e+01 + 5 1.814629920124192e+02 -3.937283270171844e+01 1.702338891722784e+02 4.897979677808214e+01 + ME 7.251319817061119e-07 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.186484166765782e+02 2.332076299066257e+02 3.412653877230306e+02 6.647280106472095e+01 + 3 2.684644304712752e+02 7.897452755298160e+01 -5.206738710801172e+01 2.512472121751683e+02 + 4 5.018698799968637e+02 -3.665172118365429e+01 -2.251847766938088e+02 -4.470143668023451e+02 + 5 3.110172728552823e+02 -2.755304362759525e+02 -6.401322392121016e+01 1.292943535624560e+02 + ME 2.528188573171588e-08 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.880157450735950e+02 -5.340609759213479e+02 -1.800693671113294e+02 
1.676794963032682e+02 + 3 1.698935524482946e+02 9.424850332356016e+00 1.535211800024927e+02 -7.215426979989711e+01 + 4 3.316905961662700e+02 1.799651549092350e+02 -1.445567114258025e+02 -2.381901588760289e+02 + 5 4.104001063118405e+02 3.446709706797568e+02 1.711048985346393e+02 1.426649323726578e+02 + ME 4.977062546288587e-08 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.260721948247595e+02 2.432868311106053e+02 -5.770296458017680e+01 3.449918852518222e+02 + 3 1.015574395166652e+02 -5.186955072305006e-01 -6.316674902066287e+01 7.952110596761376e+01 + 4 6.802443988998607e+02 -1.254022800835581e+02 1.115720726787113e+01 -6.684925034063277e+02 + 5 2.921259667587150e+02 -1.173658555198167e+02 1.097125063329686e+02 2.439795121868918e+02 + ME 7.006629557338743e-06 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.577687888188215e+02 -4.408525315058970e+00 -6.157352186961905e+02 2.313233341145688e+02 + 3 4.217509083163126e+02 -2.657091410126099e+01 3.668529733823864e+02 -2.063654794532264e+02 + 4 2.768898842003990e+02 1.707159504683328e+02 2.175125744448076e+02 -1.457230889722247e+01 + 5 1.435904186644678e+02 -1.397365110520129e+02 3.136967086899622e+01 -1.038554576412061e+01 + ME 5.101405270373193e-08 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.691757511791563e+02 7.936049457942596e+01 -4.578381871568865e+02 6.489985959561199e+01 + 3 4.168196707014678e+02 3.402072588652904e+02 1.817843664063917e+02 -1.579623468718794e+02 + 4 3.259562817662210e+02 -2.464919493866710e+02 
1.791142676162755e+02 -1.157898771339883e+02 + 5 2.880482963531554e+02 -1.730758040580453e+02 9.693955313421934e+01 2.088523644102555e+02 + ME 1.317920713383499e-08 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.587646316076485e+02 2.500214437920706e+02 6.280817637543875e+01 2.243981273456415e+01 + 3 1.696814728620085e+02 -9.470121197275458e+01 -1.406091650320611e+02 7.248820076297705e+00 + 4 5.864710833492852e+02 -2.932689801956460e+02 4.208355310058186e+02 2.843221635802258e+02 + 5 4.850828121810580e+02 1.379487483763299e+02 -3.430345423491963e+02 -3.140107963910876e+02 + ME 1.644566030583812e-08 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.510064466785172e+02 -1.763911466206437e+02 1.762664757402212e+02 -2.864488083675499e+01 + 3 1.634676601057650e+02 -1.528515845674027e+02 -1.917706779920085e+01 5.468371847554744e+01 + 4 6.544233617606191e+02 5.127258688558928e+02 -1.829185292592408e+02 3.632119649428913e+02 + 5 4.311025314550986e+02 -1.834831376678465e+02 2.582912131822037e+01 -3.892508025816840e+02 + ME 1.451849837999406e-07 + +Event 32 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.730232054084494e+02 2.432605335138694e+02 1.715636199034402e+02 -2.248033561434742e+02 + 3 2.362205628011194e+02 -4.367709384711756e+01 7.672087287982941e+01 2.191035678114456e+02 + 4 4.054383297542171e+02 -2.205824977775274e+02 1.234567003875168e+02 3.169890282605115e+02 + 5 4.853179020362142e+02 2.099905811077545e+01 -3.717411931707865e+02 -3.112892399284830e+02 + ME 6.211190595364239e-06 + +Event 33 Batch 
0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.747910883673140e+02 5.323539820562517e+02 8.641425442309115e+01 1.987878509563276e+02 + 3 1.584581085429428e+02 -1.118080933695559e+02 1.076607709535321e+02 -3.189170452896510e+01 + 4 4.953900627949077e+02 -4.939048969940528e+02 2.681611008765800e+01 -2.738910880265213e+01 + 5 2.713607402948346e+02 7.335900830735730e+01 -2.208911354642812e+02 -1.395070376247103e+02 + ME 7.441884049439939e-08 + +Event 34 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.310199002368670e+02 -1.157285499049388e+02 -6.166178457504552e+02 -6.764206926102776e+01 + 3 4.817608951852552e+02 -3.997907691000603e+01 4.373161358964807e+02 1.981157005986480e+02 + 4 2.860765236279268e+02 1.674393784787844e+02 1.106820450243557e+02 -2.038463068463705e+02 + 5 1.011426809499510e+02 -1.173175166383970e+01 6.861966482961878e+01 7.337267550875023e+01 + ME 3.190174699287766e-08 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.652617461959503e+02 4.544204650188410e+02 3.037117635510198e+01 -9.512163314500259e+01 + 3 2.394629598395643e+02 -1.805063362629200e+02 -1.540296017189007e+02 3.216913891261686e+01 + 4 3.655637126606654e+02 -1.682977929708780e+02 -3.352014704439823e+01 -3.227833338905571e+02 + 5 4.297115813038208e+02 -1.056163357850425e+02 1.571785724081969e+02 3.857358281229427e+02 + ME 2.444717400569602e-08 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 4.499192045006146e+02 -3.325790315669839e+02 7.774434570915017e+01 -2.928724894056954e+02 + 3 2.295207781400161e+02 7.258784260898673e+01 9.894250089353624e+00 2.175152788163185e+02 + 4 4.830584025161939e+02 3.898488694796316e+02 1.934726951777061e+02 2.095986532104270e+02 + 5 3.375016148431750e+02 -1.298576805216344e+02 -2.811112909762098e+02 -1.342414426210501e+02 + ME 1.776637184526316e-07 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.644647330879737e+02 4.258097602950140e+02 -2.901560281802423e+02 -4.195341569715645e+02 + 3 3.705058926630716e+00 -8.486098469472709e-01 5.377443871237386e-01 -3.566252087498443e+00 + 4 4.717811915619315e+02 -4.461616916387519e+02 8.922603813628871e+01 1.247235011701669e+02 + 5 3.600490164234641e+02 2.120054119068520e+01 2.003922456568298e+02 2.983769078888961e+02 + ME 1.020314071241294e-03 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.320658071316811e+02 -5.229898031006178e+01 -7.533526820906630e+01 -3.191506137992071e+02 + 3 5.173086194760923e+02 2.914536896664749e+02 -2.771354642533741e+02 3.253596302818244e+02 + 4 1.139748976757842e+02 3.760489884644618e+01 -1.069643132300276e+02 1.160967605846843e+01 + 5 5.366506757164423e+02 -2.767596082028593e+02 4.594350456924680e+02 -1.781869254108562e+01 + ME 6.861905994973808e-07 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.580369915970425e+02 9.859906083728804e+01 -2.937981802807959e+02 -1.793079524660254e+02 + 3 3.008057137260955e+02 -1.290389675776721e+02 
-4.755272039058870e+01 -2.675289910275308e+02 + 4 1.563788184394284e+02 -8.740572256111450e+01 -9.464096594244898e+00 -1.293251924248945e+02 + 5 6.847784762374333e+02 1.178456293014986e+02 3.508149972656295e+02 5.761621359184508e+02 + ME 5.408399337549234e-04 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.061905838966051e+02 -8.306017888269722e+01 -5.398586008655026e+01 -3.824883923718050e+01 + 3 3.775395017070493e+02 1.945243896057244e+02 3.099826314852028e+02 -9.277448666113081e+01 + 4 4.073176242310424e+02 -2.719044333335593e+02 2.932534467128883e+02 -7.731779959225281e+01 + 5 6.089522901653029e+02 1.604402226105320e+02 -5.492502181115408e+02 2.083411254905642e+02 + ME 3.071244096936900e-08 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.139380311935496e+02 -4.604569970586491e+02 -8.651738563141195e+01 2.112496053258770e+02 + 3 3.779489592837219e+02 3.440308282843344e+02 -8.249656237966933e+01 1.329756451730190e+02 + 4 4.970454597046420e+02 1.909761039695885e+02 1.692120849403949e+02 -4.265554911564500e+02 + 5 1.110675498180860e+02 -7.454993519527386e+01 -1.981369293135298e-01 8.233024065755386e+01 + ME 1.319377503780746e-06 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.173779439151573e+02 3.630558430598339e+02 -9.842883754866881e+00 2.056645825764505e+02 + 3 4.387025788534136e+02 -3.367242411299441e+02 -8.622807709247308e+00 2.810736297045815e+02 + 4 3.848566002764388e+02 -2.376531184093808e+01 2.151853038720266e+02 -3.181903482617680e+02 + 5 2.590628769549897e+02 
-2.566290088951960e+00 -1.967196124079123e+02 -1.685478640192640e+02 + ME 5.297458392691530e-07 + +Event 43 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.285774523262414e+02 3.003415161273636e+02 -7.585251142778557e+01 -1.095651056569458e+02 + 3 6.443551136010930e+02 -4.262554242356605e+02 -3.793660967414875e+02 2.993011700222419e+02 + 4 1.194081536546638e+02 7.683021647299904e+01 7.417483351494982e+01 -5.341834019283430e+01 + 5 4.076592804180017e+02 4.908369163529784e+01 3.810437746543233e+02 -1.363177241724618e+02 + ME 2.384643523886275e-07 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.344278629651232e+02 -3.630184907012124e+02 -2.117651291203814e+02 1.100030618577765e+02 + 3 3.783362210634240e+02 -2.139207812824498e+02 2.214425529500993e+02 2.198622096837037e+02 + 4 2.731070100083522e+02 2.386736268129995e+02 7.781587548733638e+01 -1.075501199523767e+02 + 5 4.141289059631002e+02 3.382656451706627e+02 -8.749329931705429e+01 -2.223151515891034e+02 + ME 2.301107143120126e-07 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.489710523078152e+02 -1.087275227839726e+01 -1.135937836198698e+01 -2.484740080338036e+02 + 3 5.496834214369438e+02 3.652599963991979e+02 -1.627221171722717e+02 3.771717266923071e+02 + 4 3.590003089931697e+02 -3.273342096749049e+02 -3.092057510665312e+01 1.441438693416079e+02 + 5 3.423452172620712e+02 -2.705303444589564e+01 2.050020706409118e+02 -2.728415880001114e+02 + ME 4.667251809043753e-06 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.597985691912828e+02 -2.306727818104758e+02 7.603141863539261e+01 9.222032132856624e+01 + 3 7.002947929838681e+02 4.239737553652563e+02 1.188001021383240e+01 -5.572413455578470e+02 + 4 2.992434877229012e+02 -1.661439730366823e+02 1.360103548958277e+01 2.485112814376109e+02 + 5 2.406631501019479e+02 -2.715700051809825e+01 -1.015124643388078e+02 2.165097427916698e+02 + ME 2.961095153521793e-06 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.018388784182160e+02 8.642536924923899e+00 -7.368701360758433e+01 1.877084532518991e+02 + 3 6.387534662808698e+02 -2.318851129196156e+02 1.997708430514635e+02 -5.606486380600519e+02 + 4 2.649820787300667e+02 1.788659005461100e+02 -1.950823496502483e+02 -1.285956938916121e+01 + 5 3.944255765708474e+02 4.437667544858177e+01 6.899852020636909e+01 3.857997541973140e+02 + ME 5.566758395539804e-04 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.027019167519019e+02 -6.158689118671298e+01 4.178205606243037e+02 -2.726577544460905e+02 + 3 5.743238667714224e+02 3.184605495288457e+02 -1.749486635874764e+02 4.447738160526316e+02 + 4 3.758842972635484e+02 -2.607694178749530e+02 -2.399878564004706e+02 -1.252762723374896e+02 + 5 4.708991921312749e+01 3.895759532820398e+00 -2.884040636356642e+00 -4.683978926905154e+01 + ME 1.161565714908134e-06 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.764648203352892e+02 
2.738178952147447e+01 -3.157099897407809e+02 -3.558035483207229e+02 + 3 4.895961256288143e+02 -3.256740061063621e+02 3.286155635585781e+02 1.601643511036614e+02 + 4 1.166509262431076e+02 1.759755750888453e+01 3.782228425839705e+01 -1.089368595865213e+02 + 5 4.172881277927895e+02 2.806946590760029e+02 -5.072785807619405e+01 3.045760568035828e+02 + ME 2.001075766821350e-08 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.289894155958066e+02 4.693347168027174e+02 -2.050228258252125e+02 -1.323645205738705e+02 + 3 3.157727461963105e+02 1.138825334157519e+02 2.859620904593797e+02 -7.049027339478296e+01 + 4 5.977725833195259e+02 -5.754593992719118e+02 -5.502927305745767e+01 1.521522929990793e+02 + 5 5.746525488835672e+01 -7.757850946557377e+00 -2.590999157670938e+01 5.070250096957403e+01 + ME 2.027073801530859e-07 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.219327715020971e+02 5.984785298499140e+01 -4.338034317791254e+01 2.092618823041012e+02 + 3 3.719694454596720e+02 2.481003316167898e+02 6.684163406846994e+01 -2.689603869759849e+02 + 4 5.038009677439173e+02 -3.561225019008226e+02 3.408324299472029e+02 -1.040453431725396e+02 + 5 4.022968152943128e+02 4.817431729904129e+01 -3.642937208377600e+02 1.637438478444240e+02 + ME 9.141635997867289e-08 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.643797726078951e+02 -1.708128057115347e+02 -2.017258340575682e+02 5.132001844727675e+00 + 3 1.082614026075099e+02 8.023457653018268e+01 -7.166510600964264e+01 -1.212668974594048e+01 + 4 
5.888736361086695e+02 1.251923882116535e+02 2.277230615162054e+02 -5.284328079249008e+02 + 5 5.384851886759255e+02 -3.461415903030152e+01 4.566787855100542e+01 5.354274958261134e+02 + ME 1.101728250408918e-06 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.236225962870595e+02 -8.655229351820060e+01 -2.025655916342638e+02 3.850905273783226e+01 + 3 3.598708704269267e+02 -3.246728222408163e+01 3.017281195636015e+02 -1.934245610863020e+02 + 4 4.959901518772790e+02 3.780916831344549e+02 2.233019308120419e+02 2.306277468116429e+02 + 5 4.205163814087352e+02 -2.590721073921725e+02 -3.224644587413796e+02 -7.571223846317316e+01 + ME 2.346330708569022e-07 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.833324108095403e+02 3.689695429376331e+02 -9.184861439378329e+01 -4.867284248315899e+01 + 3 1.733517466553474e+02 -1.283859018369739e+02 1.142418318268169e+02 2.273086313407226e+01 + 4 5.882352328920327e+02 -5.696191122096893e+02 -1.438232518530712e+02 -2.948946332840464e+01 + 5 3.550806096430796e+02 3.290354711090303e+02 1.214300344200377e+02 5.543144267749134e+01 + ME 1.794998655287127e-07 + +Event 55 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.230627174531746e+02 4.559332054335195e+00 -3.968535050007001e+02 -1.465215716498506e+02 + 3 3.495042365247251e+02 -1.042339596976066e+02 -8.990135550403500e+01 -3.212572789417008e+02 + 4 2.159564078947227e+02 -5.070199742482324e+01 1.962132698937686e+02 7.461119410329924e+01 + 5 5.114766381273785e+02 1.503766250680946e+02 2.905415906109669e+02 
3.931676564882520e+02 + ME 3.641365922461726e-07 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.301047227118417e+02 -1.540011276808692e+02 1.679727819701612e+02 3.189013334119058e+01 + 3 8.914969805191234e+01 6.109731064089875e+01 -5.747680497126689e+01 -3.018615883963399e+01 + 4 6.600518622851478e+02 5.103775210112311e+02 -4.083637358386221e+02 -9.177312354086116e+01 + 5 5.206937169510982e+02 -4.174737039712606e+02 2.978677588397277e+02 9.006914903930458e+01 + ME 1.512772939016334e-02 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.428929101034773e+02 4.260395144351128e+02 1.172634949350701e+02 -2.989539622472092e+01 + 3 2.959829527779269e+02 -1.135559932551681e+02 -1.425261047989344e+02 -2.332321894203626e+02 + 4 4.151099697755373e+02 -1.242288834014605e+01 -6.521586552925591e+01 4.097668236150967e+02 + 5 3.460141673430584e+02 -3.000606328397987e+02 9.047847539312012e+01 -1.466392379700132e+02 + ME 2.225679869813506e-08 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.357156983149218e+02 -5.877712383564344e+01 2.922019789681376e+02 3.178229427822564e+02 + 3 3.319248809135803e+02 -2.788588871424603e+01 2.781385476596405e+02 -1.789845023623472e+02 + 4 1.585962071308598e+02 -2.424722381792027e+01 -7.876980550994806e+01 1.354996191630929e+02 + 5 5.737632136406388e+02 1.109102363678097e+02 -4.915707211178297e+02 -2.743380595830021e+02 + ME 1.677800521082310e-08 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.336459069578591e+02 -1.992843481885653e+01 5.107665464174290e+02 -1.532917041439582e+02 + 3 1.581511369245756e+02 -9.743817802085913e+01 -5.863052068271146e+01 -1.099092608426849e+02 + 4 5.888078775864335e+02 2.639198311142280e+02 -3.515067392327602e+02 3.917704069048989e+02 + 5 2.193950785311321e+02 -1.465532182745124e+02 -1.006292865019574e+02 -1.285694419182558e+02 + ME 2.491091844960630e-08 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.398861408394455e+02 9.292985018883905e+01 -1.755261709518740e+02 2.758209473364647e+02 + 3 4.495621192790998e+02 1.515758570389179e+02 -3.450839350786028e+02 -2.450468045176262e+02 + 4 5.587170539667186e+02 -2.339829412606645e+02 4.760297086658421e+02 -1.755347431700376e+02 + 5 1.518346859147359e+02 -1.052276596709246e+01 4.458039736463481e+01 1.447606003511990e+02 + ME 2.613510262747367e-06 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.175194209979109e+01 5.998402058062321e+01 -1.454594179391588e+01 -1.906621464091404e+00 + 3 3.885205042949811e+02 2.010533504306213e+02 -3.301647653931597e+02 3.894817365587326e+01 + 4 4.121877137292709e+02 -1.873608925987553e+02 -2.415372201761172e+02 -2.765038490573954e+02 + 5 6.375398398759570e+02 -7.367647841248927e+01 5.862479273631928e+02 2.394622968656136e+02 + ME 4.036115402945072e-07 + +Event 62 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.423805753542257e+02 -2.842742526400005e+02 1.403994964620940e+02 
1.292307589509954e+02 + 3 3.730016642701108e+02 -6.375176338198228e+01 3.556485890785658e+02 9.262847996370058e+01 + 4 1.869146245818500e+02 1.512767776127116e+02 -3.521592858368061e+01 1.039819783023830e+02 + 5 5.977031357938134e+02 1.967492384092712e+02 -4.608321569569791e+02 -3.258412172170790e+02 + ME 4.041152133461143e-08 + +Event 63 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.204233175784373e+02 -1.250932662423830e+01 -4.690369092824185e+02 -2.251406759285211e+02 + 3 1.393553568983892e+02 8.467826532496939e+01 1.098936348220459e+02 -1.314898868469838e+01 + 4 2.210675038562465e+02 -1.091496397100441e+02 -1.833244208077437e+02 -5.787360493268422e+01 + 5 6.191538216669276e+02 3.698070100931305e+01 5.424676952681161e+02 2.961632695459039e+02 + ME 1.717359783444398e-08 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.285187126492505e+02 5.115930460477168e+02 2.913679691486133e+02 -2.200296205970973e+02 + 3 3.530874069984856e+02 -5.757495613833129e+01 -3.215266291557356e+02 1.340763519429640e+02 + 4 2.198421555599027e+02 -1.613778626189666e+02 6.174872575212309e+01 1.359222339639379e+02 + 5 2.985517247923616e+02 -2.926402272904186e+02 -3.159006574500085e+01 -4.996896530980442e+01 + ME 3.515639330107456e-08 + +Event 65 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.733821832722187e+02 2.303260106007256e+02 -1.646059812172806e+00 1.472584036940988e+02 + 3 1.394921625216098e+02 -9.684562217110877e+01 4.145751137329390e+01 -9.143447720624464e+01 + 4 5.102700480977415e+02 3.604369604937131e+02 
-2.998931993418236e+02 -2.013076960383037e+02 + 5 5.768556061084297e+02 -4.939173489233301e+02 2.600817477807025e+02 1.454837695504496e+02 + ME 2.842804460766261e-08 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.544462431538032e+02 -3.753345008667602e+02 -2.512516495498346e+02 5.017976658168833e+01 + 3 3.165563250580394e+02 1.103952965289287e+02 -2.325997113597451e+02 -1.841688348062841e+02 + 4 3.684879093534595e+02 6.572067686856795e+01 2.241075105812512e+02 2.850262368146046e+02 + 5 3.605095224346968e+02 1.992185274692629e+02 2.597438503283281e+02 -1.510371685900090e+02 + ME 7.527958990971987e-09 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.440190035555439e+02 2.253514126547281e+02 -1.260243593843264e+00 -9.359715111499490e+01 + 3 3.825895783887985e+02 -2.104023034558340e+02 -1.413119979724069e+02 -2.865947930811798e+02 + 4 4.876346193001702e+02 -2.655995372194985e+02 -1.506557767336875e+02 3.801936936688712e+02 + 5 3.857567987554871e+02 2.506504280206045e+02 2.932280182999376e+02 -1.749472696388871e-03 + ME 6.340516691841314e-08 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.035832133728723e+02 -2.971861821973760e+02 1.881283924203020e+02 -4.905107303024299e+02 + 3 4.910040511213286e+02 6.999750717015648e+01 6.653935986886803e+01 4.814123396795738e+02 + 4 1.379763939873844e+02 1.378874442733864e+02 -2.922008491766642e+00 -3.999984486182566e+00 + 5 2.674363415184150e+02 8.930123075383310e+01 -2.517457437974033e+02 1.309837510903859e+01 + ME 4.388801509248543e-07 + +Event 69 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.030381868442191e+02 2.272305877240549e+02 1.124384392229257e+01 3.326854611056926e+02 + 3 4.200661478070353e+02 -7.160564158348356e+01 3.787564442608000e+02 -1.669483649448990e+02 + 4 3.799818119209206e+02 1.049459036478361e+02 -3.642939882503814e+02 -2.573761494148145e+01 + 5 2.969138534278246e+02 -2.605708497884073e+02 -2.570629993271116e+01 -1.399994812193120e+02 + ME 7.378457328980502e-09 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.167686328179870e+02 -7.473352909257788e+00 2.955788092743163e+01 2.146138878198537e+02 + 3 4.902322685632545e+02 6.433192499326779e+01 -2.241342640720431e+02 -4.312225785325012e+02 + 4 4.524638474271014e+02 1.238344991848496e+02 4.320121481028073e+02 5.247908089678296e+01 + 5 3.405352511916566e+02 -1.806930712688596e+02 -2.374357649581957e+02 1.641296098158646e+02 + ME 2.274169082581569e-07 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.536202816327036e+02 -2.094555017497595e+02 -3.528962486096462e+02 -1.932976730654783e+02 + 3 4.830671392542954e+02 -1.130399763251615e+02 4.195776068459593e+02 -2.110224078460707e+02 + 4 1.227243866738444e+02 7.580385517780424e+01 7.042433585630759e+01 6.599593581313735e+01 + 5 4.405881924391570e+02 2.466916228971166e+02 -1.371056940926206e+02 3.383241450984115e+02 + ME 2.734689217004170e-07 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 6.855461065651832e+02 -6.960662796769586e+01 6.543639335521405e+02 -1.921879913933706e+02 + 3 1.544820687132741e+02 -4.432993363636480e+01 -1.101873255905340e+02 9.878420833784054e+01 + 4 1.673033372364590e+02 6.926279643044644e+01 -5.591621839923216e+01 1.416560912951512e+02 + 5 4.926684874850839e+02 4.467376517361424e+01 -4.882603895623743e+02 -4.825230823962138e+01 + ME 1.464452520670558e-06 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.670906658704048e+01 -5.458408693475395e+00 -1.816753288898089e+01 1.880214778193577e+01 + 3 6.242100951925031e+02 -5.543116507793767e+02 2.403203993953239e+02 1.569169920210937e+02 + 4 4.464355243035054e+02 1.923986431943475e+02 -3.808569523700175e+02 -1.312837434282789e+02 + 5 4.026453139169510e+02 3.673714162785045e+02 1.587040858636745e+02 -4.443539637475060e+01 + ME 5.056209382361270e-07 + +Event 74 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.507575031696209e+02 -1.392071235576032e+02 -5.787209256044108e+01 -1.497178419166361e-01 + 3 6.125489314065022e+02 6.060714876125713e+02 -8.678795919437111e+01 1.901040265772211e+01 + 4 3.363511975714604e+02 -2.665860722445908e+02 -1.847740056381793e+02 8.900876941840997e+01 + 5 4.003423678524158e+02 -2.002782918103775e+02 3.294340573929915e+02 -1.078694542342155e+02 + ME 1.168068775703116e-08 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.949609689268431e+01 2.952198203772742e+01 1.197957151721274e+01 -2.334275289215887e+01 + 3 4.190786176726720e+02 1.383393001415919e+02 
-2.870901836423846e+02 2.721550151714166e+02 + 4 4.516269609304362e+02 -4.017945448128341e+02 -2.062213189338416e+02 -9.073120168422382e-01 + 5 5.897983245042077e+02 2.339332626335147e+02 4.813319310590135e+02 -2.479049502624156e+02 + ME 2.941632001717120e-06 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.715950956231763e+02 -5.685992331526843e+02 -4.378901164006451e+01 3.870901520620015e+01 + 3 2.332327857868953e+02 1.165329531079329e+02 -1.925039496033648e+02 -6.131747379797516e+01 + 4 3.244822622272620e+02 1.227113072274081e+02 2.628412386324300e+02 -1.454137436861874e+02 + 5 3.706898563626651e+02 3.293549728173442e+02 -2.654827738900049e+01 1.680222022779630e+02 + ME 9.769801130347612e-09 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.469171932013446e+02 6.704141444804746e+01 -6.996151281265823e+01 -2.271099640530981e+02 + 3 6.144793631549348e+02 -4.733339087492145e+02 9.433660898623474e+01 3.803163193664752e+02 + 4 1.021761213297807e+02 -9.029119947808269e+01 1.069887339263507e+01 4.661537487713156e+01 + 5 5.364273223139398e+02 4.965836937792497e+02 -3.507396956621159e+01 -1.998217301905087e+02 + ME 2.371712188035742e-04 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.967497774816139e+02 -2.697858484391464e+02 6.318686962407353e+02 1.158352239764815e+02 + 3 6.532937684360128e+01 -1.968556360581403e+00 -5.408995210783413e+01 -3.658318391830724e+01 + 4 6.825170260279161e+02 3.242089958026898e+02 -5.955481599371362e+02 -7.772005567479935e+01 + 5 5.540381964686839e+01 
-5.245459100296187e+01 1.776941580423528e+01 -1.531984383375091e+00 + ME 3.944930246763812e-05 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.067690710988119e+02 -2.411523221575334e+02 7.258473255078881e+01 -1.751692812721097e+02 + 3 4.341993133670206e+02 -2.152390662369685e+02 -1.744359138858042e+02 3.343251412033717e+02 + 4 4.316885855045251e+02 4.064167724737122e+02 -7.391084315808247e+00 1.453472179974050e+02 + 5 3.273430300296425e+02 4.997461592078955e+01 1.092422656508236e+02 -3.045030779286670e+02 + ME 2.636340331103492e-07 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.542875433020407e+01 3.027762979867649e+01 3.286699452335475e+01 -8.172975270371373e+00 + 3 4.747123602854809e+02 -4.741886647646234e+02 -1.797507628065838e+01 -1.318453068032194e+01 + 4 5.041118878446804e+02 2.786633966721448e+02 2.426995114610605e+02 -3.428883986926841e+02 + 5 4.757469975396347e+02 1.652476382938020e+02 -2.575914297037568e+02 3.642459046433774e+02 + ME 2.312691115991803e-07 + +Event 81 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.864116929519440e+02 3.277381135111354e+02 3.711141341299856e+02 3.142303327686207e+02 + 3 3.685765254538676e+02 -3.416666086612904e+02 -1.007529100226153e+02 -9.466485495426969e+01 + 4 2.326234951979821e+02 1.494507175101483e+02 8.025952299133927e+00 -1.780835692688112e+02 + 5 3.123882863962061e+02 -1.355222223599932e+02 -2.783871764065043e+02 -4.148190854553969e+01 + ME 2.172975415413290e-08 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.427639687834171e+02 9.380520328619317e+00 1.408494872006386e+02 3.123468682248004e+02 + 3 6.478929207969747e+02 -1.321126375113857e+02 -3.734561186667145e+02 -5.126812023655762e+02 + 4 3.203811193174652e+02 -1.134795021127588e+01 1.346301271349556e+02 2.904995946791319e+02 + 5 1.889619911021436e+02 1.340800673940422e+02 9.797650433112031e+01 -9.016526053835609e+01 + ME 2.530448161241236e-08 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.062422362872099e+02 5.909870990240152e+01 2.721577348549549e+02 1.273648975686913e+02 + 3 2.466716629423465e+02 -1.034354582623313e+02 -1.868503752330573e+02 1.234299499638147e+02 + 4 4.906943885930805e+02 5.161889116502462e+01 3.651680763098555e+02 -3.236800103741192e+02 + 5 4.563917121773633e+02 -7.282142805094902e+00 -4.504754359317530e+02 7.288516284161318e+01 + ME 1.432872587895112e-08 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.336579016443923e+02 -7.328260784589017e+01 1.067078727201235e+02 -3.327948255056634e+01 + 3 6.164528201146906e+02 -4.207287356636680e+02 6.823455463559318e+01 4.453599173341587e+02 + 4 1.467908941354667e+02 -2.567468685241225e+01 -7.754572458221435e+01 -1.219632635474294e+02 + 5 6.030983841054507e+02 5.196860303619703e+02 -9.739670277350223e+01 -2.901171712361628e+02 + ME 2.657444769492023e-07 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.358447951050334e+02 
-1.672088219728145e+02 2.854579619038340e+02 5.785056818489146e+01 + 3 4.684481782024461e+02 3.813275907644369e+02 3.444658097706863e+01 -2.699007173795044e+02 + 4 4.650926385624551e+02 -3.110546686540761e+02 -1.387147979935619e+02 3.167244233890000e+02 + 5 2.306143881300663e+02 9.693589986245351e+01 -1.811897448873403e+02 -1.046742741943869e+02 + ME 6.094494816029316e-09 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.346862187327210e+02 -1.105019805657391e+02 -2.067698421018028e+02 1.063794864187419e+01 + 3 5.058624304519142e+02 -4.262206430615211e+02 2.700548789916521e+02 3.609881305175242e+01 + 4 6.480280992696387e+02 6.454938193212064e+02 -5.106895467449709e+01 -2.588645455826068e+01 + 5 1.114232515457259e+02 -1.087711956939464e+02 -1.221608221535224e+01 -2.085030713536592e+01 + ME 1.075942693608974e-07 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.728899334305392e+02 4.659191099288728e+02 -3.129457850944544e+01 -7.459840145031787e+01 + 3 4.494169400629931e+02 -7.010118226273991e+01 1.008358429699956e+02 4.323118585390418e+02 + 4 2.844458559289023e+01 -7.098330388355537e+00 -9.888008676198385e+00 2.570866468077314e+01 + 5 5.492485409135777e+02 -3.887195972777777e+02 -5.965325578435165e+01 -3.834221217694970e+02 + ME 2.788995718688161e-03 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.984878614180074e+02 -8.577726579355449e+01 4.910354340013713e+02 -4.075685116146818e+00 + 3 4.609805056931713e+02 2.129284200140230e+02 -3.890221250310132e+02 1.258026264265514e+02 + 4 
2.969828717410930e+02 -2.479988381372318e+02 1.094380361139229e+02 -1.213207263483755e+02 + 5 2.435487611477274e+02 1.208476839167629e+02 -2.114513450842807e+02 -4.062149620293105e-01 + ME 4.726540488185185e-09 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.726870008116489e+02 1.539765893236599e+02 -3.662906941914415e+02 2.560377667948944e+02 + 3 4.594378962938941e+02 -1.479060782129438e+02 4.245625574332341e+02 9.462350344590692e+01 + 4 1.718353569329644e+02 -3.298783981694856e+01 -1.660422757461217e+02 2.948143452309453e+01 + 5 3.960397459614918e+02 2.691732870623243e+01 1.077704125043293e+02 -3.801427047638959e+02 + ME 6.682360637310473e-08 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.474695652900048e+02 -9.333174748729689e+01 1.132426971448449e+02 1.457906516726570e+01 + 3 3.251565719826368e+02 2.585053334584991e+02 -2.175136337350804e+00 -1.972233699613810e+02 + 4 5.790098794634627e+02 6.564529205760054e+01 -4.580486422816988e+02 3.480439303462397e+02 + 5 4.483639832638957e+02 -2.308188780288028e+02 3.469810814742047e+02 -1.653996255521244e+02 + ME 3.237193462730394e-07 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.820899052888926e+02 -1.808581415393255e+02 -3.203199804488940e+02 1.033350786898440e+02 + 3 9.295446296256254e+01 -2.298114143078748e+01 3.555880655718099e+01 8.275246582081729e+01 + 4 4.336620114613530e+02 8.456554310968212e+01 -2.782844067212401e+02 -3.216662837150925e+02 + 5 5.912936202871921e+02 1.192737398604308e+02 5.630455806129528e+02 1.355787392044316e+02 
+ ME 7.460683189703289e-08 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.270593439871326e+02 5.052201207875459e+02 3.779850976205835e+01 1.453115766046593e+02 + 3 2.463043854963668e+02 1.231304177150150e+02 -3.773893720068452e+01 -2.099536214668173e+02 + 4 6.690437393076164e+02 -6.594864940642269e+02 8.207385292371979e+00 1.123820629827312e+02 + 5 5.759253120888413e+01 3.113595556166609e+01 -8.266957853745714e+00 -4.774001812057303e+01 + ME 1.868005487718810e-07 + +Event 93 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.200017882248952e+02 -1.560830085677291e+02 -1.325663041349293e+02 8.040557360409328e+01 + 3 6.407281865953224e+02 1.823233391623662e+02 6.142037882279665e+02 -6.672002760069836e+00 + 4 8.521385755377796e+01 4.383951335035401e+01 5.404958342950356e+01 4.917459831556156e+01 + 5 5.540561676260038e+02 -7.007984394499141e+01 -5.356870675225406e+02 -1.229081691595848e+02 + ME 8.303271327058974e-07 + +Event 94 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.370935815124155e+02 5.275516558105982e+02 1.000073439779119e+02 1.254180624206741e+01 + 3 5.057716473607446e+02 -3.633458733842591e+02 -1.608969609488800e+02 -3.128848087172752e+02 + 4 9.454406003253546e+01 -6.967807726468104e+01 4.085645048107036e+01 -4.913547893549688e+01 + 5 3.625907110943047e+02 -9.452770516165786e+01 2.003316648989775e+01 3.494784814107046e+02 + ME 1.250609356373499e-06 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.256779780037450e+02 -3.004969003882455e+02 9.471368436356252e+00 1.252120257547474e+02 + 3 2.582480939333944e+02 -2.012383405486748e+02 7.138897109957118e+01 1.452543394146680e+02 + 4 5.694131298692070e+02 2.847102884863675e+02 8.193724006527485e+01 -4.862691156015786e+02 + 5 3.466607981936535e+02 2.170249524505529e+02 -1.627975796012024e+02 2.158027504321630e+02 + ME 1.075954462784556e-06 + +Event 96 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.766185052317254e+02 4.233490188089946e+02 1.329496801040322e+01 -2.185498836155904e+02 + 3 7.890873879535889e+01 -4.740418313873944e+01 4.641498695348351e+01 4.272097219552690e+01 + 4 4.792869803542939e+02 -2.336105143624531e+02 3.831040160425885e+02 1.684442044372187e+02 + 5 4.651857756186217e+02 -1.423343213078020e+02 -4.428139710064752e+02 7.384706982844816e+00 + ME 7.018146836849039e-06 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.454191314938116e+02 -2.198423252847823e+02 -1.067171314055372e+02 -2.261318157911463e+01 + 3 6.283883510117278e+02 4.722599592005207e+02 -1.937207663271801e+02 -3.664897206154232e+02 + 4 1.620781893868624e+02 -7.571987944222909e+01 -4.595113060672451e+01 1.357362623919699e+02 + 5 4.641143281075982e+02 -1.766977544735093e+02 3.463890283394419e+02 2.533666398025678e+02 + ME 1.041263244953328e-07 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.982747694543485e+02 4.799793179522204e+02 -1.370111687864802e+01 -1.330784716217580e+02 + 3 
5.240661916116436e+02 -3.125813856230740e+02 1.929132879729713e+02 3.737950158519351e+02 + 4 2.943287261583366e+02 -2.480543997855069e+02 -1.355691835838943e+02 -8.197200895084492e+01 + 5 1.833303127756715e+02 8.065646745636055e+01 -4.364298751042897e+01 -1.587445352793322e+02 + ME 9.367943186127541e-08 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.112057021621225e+02 -2.049393130332368e+02 -2.328035275724480e+01 -4.545054207331783e+01 + 3 5.066243223503690e+02 -2.358079378948771e+02 -4.471534870778941e+02 3.341525724599694e+01 + 4 4.653239426414495e+02 4.182795598186355e+02 1.760198094306581e+02 -1.028863845283059e+02 + 5 3.168460328460583e+02 2.246769110947843e+01 2.944140304044811e+02 1.149216693556268e+02 + ME 2.493269729158601e-08 + +Event 100 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.261030820391055e+02 -1.819121811113250e+01 6.012879360431592e+01 1.093416437734688e+02 + 3 4.769048092319793e+02 -2.259295543204739e+02 1.301569921137044e+02 -3.993159037116550e+02 + 4 5.269499564110497e+02 2.313940015147070e+02 1.423959739422059e+02 4.515046613598465e+02 + 5 3.700421523178656e+02 1.272677091689940e+01 -3.326817596602262e+02 -1.615304014216603e+02 + ME 2.131711776325623e-08 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.071082658590778e+02 -2.306343931636615e+02 -1.906219995172719e+02 -2.760582730527242e+02 + 3 7.055902937697462e+02 3.509939199461517e+02 9.908499693062988e+01 6.040224285621921e+02 + 4 1.506354165156303e+02 -3.616228611940872e+00 8.811142866916485e+01 
-1.221242307543523e+02 + 5 2.366660238555452e+02 -1.167432981705493e+02 3.425573917477071e+00 -2.058399247551156e+02 + ME 5.689188294381573e-06 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.679004164430065e+02 1.260994283332420e+02 8.794295354075821e+01 4.419228109116983e+02 + 3 2.893886902350230e+02 1.121360101901791e+02 -2.665918046579882e+02 -1.000694477686554e+01 + 4 1.114482866002886e+02 7.246487011182148e+01 -5.194724669148226e+01 6.686588627238250e+01 + 5 6.312626067216818e+02 -3.107003086352427e+02 2.305960978087122e+02 -4.987817524072153e+02 + ME 3.143301099713104e-07 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.015611695754375e+02 -2.478016296999642e+02 2.089981655060758e+02 -2.369925991332430e+02 + 3 2.805421352009302e+02 2.015483315378810e+02 1.930950025069369e+02 2.822197165754875e+01 + 4 2.160334239749209e+02 1.520789029588300e+02 1.534112976413324e+02 2.724207255193556e+00 + 5 6.018632712487113e+02 -1.058256047967467e+02 -5.555044656543449e+02 2.060464202205007e+02 + ME 1.737002284745109e-04 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.888143656522529e+02 5.760997201270005e+02 9.608601910085935e+01 7.469234633753238e+01 + 3 3.060271446633420e+02 -1.984745521217481e+02 2.204192649519959e+02 7.533799220108909e+01 + 4 3.967306659051177e+02 -3.423810873408198e+02 -1.995377379946823e+02 1.884418787853176e+01 + 5 2.084278237792874e+02 -3.524408066443250e+01 -1.169675460581730e+02 -1.688745264171534e+02 + ME 1.991151547630254e-08 + +Event 105 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.504015321144005e+02 1.426088302035090e+02 -2.057470186990172e+02 5.640312907289355e+00 + 3 1.979630815113672e+02 4.982132386022299e+01 1.913292686370453e+02 1.001640120524147e+01 + 4 4.659014384742007e+02 2.574185871348824e+02 -5.255943452024413e+01 -3.847561919095292e+02 + 5 5.857339479000306e+02 -4.498487411986147e+02 6.697718458221685e+01 3.690994777969989e+02 + ME 3.090366930429195e-08 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.361513001939431e+02 -3.741524637210580e+01 1.304189622135487e+01 7.350841754922350e+02 + 3 4.122811658031853e+02 1.145408114286395e+02 -7.459440103785865e+00 -3.959804527330110e+02 + 4 2.337908007462917e+02 -6.120784213133228e+01 -2.574540563694115e+01 -2.241626923995327e+02 + 5 1.177767332565806e+02 -1.591772292520144e+01 2.016294951937231e+01 -1.149410303596913e+02 + ME 9.317247557874130e-03 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.705546785847132e+02 1.262951978594903e+01 -9.714584685273634e+01 1.396140324479450e+02 + 3 6.671070399945476e+02 2.578935168541208e+02 6.114409935711719e+02 -6.828358628248792e+01 + 4 2.745513521168063e+02 -2.324733487602298e+02 -1.283830712319619e+02 -6.965898424769125e+01 + 5 3.877869293039331e+02 -3.804968787984004e+01 -3.859120754864737e+02 -1.671461917765819e+00 + ME 5.818871599878634e-08 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 1.706224267805914e+02 -4.353661443150737e+01 9.404361586016130e+01 1.355447307728594e+02 + 3 2.687777512938866e+02 2.082248009387954e+02 3.014322059927089e+01 1.672581780257728e+02 + 4 3.692025609623798e+02 3.365346633914971e+02 2.321314261258354e+01 1.500536616254310e+02 + 5 6.913972609631422e+02 -5.012228498987852e+02 -1.473999790720158e+02 -4.528565704240632e+02 + ME 5.619214334297136e-06 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.466785643841017e+02 -2.365744581492209e+02 2.771136527000190e+02 -2.583839954774635e+02 + 3 3.878915327990414e+02 2.483945414202765e+02 -1.631710090657951e+02 -2.492693619566003e+02 + 4 1.253118648319771e+02 4.194400500436873e+01 1.160335020033092e+02 2.190868149180264e+01 + 5 5.401180379848795e+02 -5.376408827542427e+01 -2.299761456375331e+02 4.857446759422613e+02 + ME 6.905592031281615e-07 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.381411416550031e+02 6.702134331125377e+01 -1.162827706142533e+02 -1.967172085705170e+02 + 3 6.212430200653040e+02 -3.967265104066120e+02 1.827510534384594e+02 4.417612685462425e+02 + 4 1.823603315775934e+02 2.482060450627628e+01 -1.599146046747259e+02 8.406275832233294e+01 + 5 4.582555067020996e+02 3.048845625890820e+02 9.344632185051984e+01 -3.291068182980586e+02 + ME 5.999106841983196e-07 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.374985436753868e+02 -2.389765770808209e+02 -2.262746633651867e+02 7.480133226562783e+01 + 3 1.687316160676459e+02 -3.174949652867830e+01 
-1.649088651344595e+02 1.635218428955414e+01 + 4 5.145505242681481e+02 2.608128140187750e+01 2.183177034136404e+02 -4.652089736465277e+02 + 5 4.792193159888192e+02 2.446447922076218e+02 1.728658250860057e+02 3.740554570913458e+02 + ME 3.382335155491244e-07 + +Event 112 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.705149137771966e+02 -1.687461219713958e+02 -2.442102979231829e+01 -1.922353750557715e+00 + 3 6.869944158795414e+02 2.444246111927705e+02 5.622677815177788e+02 3.099562529777551e+02 + 4 6.218245605391450e+02 -5.521148956020889e+01 -5.388632075909081e+02 -3.053586727359833e+02 + 5 2.066610980411796e+01 -2.046699966116588e+01 1.016455865447357e+00 -2.675226491214084e+00 + ME 8.157831235667544e-03 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.130363090939669e+02 1.028036904144503e+01 1.180709930084098e+01 -1.119469231022390e+02 + 3 4.872852151720413e+02 7.570757814761660e+01 3.354557570129505e+02 -3.452313407313326e+02 + 4 5.826096241012513e+02 8.602003542439087e+01 -5.178852261950345e+02 2.526448497197317e+02 + 5 3.170688516327408e+02 -1.720079826134526e+02 1.706223698812430e+02 2.045334141138398e+02 + ME 2.369877951462478e-06 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.488748255497193e+02 -5.783809298403492e+01 -4.190201165497775e+02 1.502181944335373e+02 + 3 3.885313578527227e+02 -3.031237352159773e+02 6.719013528635190e+01 -2.335767601962924e+02 + 4 1.721991327065912e+02 1.359105605864553e+02 -8.348365698663603e+01 6.489483679088326e+01 + 5 4.903946838909665e+02 
2.250512676135569e+02 4.353136382500616e+02 1.846372897187182e+01 + ME 8.587285951331933e-09 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.614867481905577e+02 -2.044427690075313e+02 5.545544724970484e+02 -2.970474865501677e+02 + 3 5.299069281136580e+02 2.787046279045812e+01 -4.218547569036955e+02 3.194732443326408e+02 + 4 2.112135717499224e+02 1.481206856190750e+02 -1.329299397515352e+02 7.071821901566830e+01 + 5 9.739275194586149e+01 2.845162059799823e+01 2.302241581822219e-01 -9.314397679814118e+01 + ME 1.508971290876639e-06 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.273084495291165e+02 2.026422934429814e+02 -1.202888698929614e+01 2.567532432494704e+02 + 3 3.558310071858317e+02 2.658361944151384e+02 -1.056345448838793e+02 -2.116321486469763e+02 + 4 4.261839858364658e+02 -3.197528890451962e+02 -4.295116688092089e+01 2.784709625693038e+02 + 5 3.906765574485858e+02 -1.487255988129233e+02 1.606145987540962e+02 -3.235920571717980e+02 + ME 5.227236495827973e-09 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.398873526503554e+02 -1.697886094974673e+02 2.463425145650708e+02 -1.612780397150885e+02 + 3 3.484530288300059e+02 3.072641586090561e+02 1.508174714846009e+02 6.528660222484777e+01 + 4 4.599006329697285e+02 8.910216575675491e+00 -4.016585734725035e+02 -2.238293783277009e+02 + 5 3.517589855499089e+02 -1.463857656872638e+02 4.498587422832148e+00 3.198208158179417e+02 + ME 6.978074268652368e-09 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.636830965937108e+02 -1.350752771701317e+02 -4.094705561989163e+01 -8.288645392330184e+01 + 3 5.340512912451784e+02 -4.440034731447416e+02 -2.645266954971571e+02 -1.345262981182612e+02 + 4 5.602763248554180e+02 5.415787847304526e+02 9.167439365323624e+01 1.104435859237257e+02 + 5 2.419892873056931e+02 3.749996558442060e+01 2.137993574638124e+02 1.069691661178374e+02 + ME 2.011985114805886e-06 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.393958606639748e+01 5.505558390943226e+01 3.178415492333581e+01 6.849879016676006e+00 + 3 5.103236431617297e+02 1.902911879696913e+02 -2.878340806151940e+02 3.759933863611834e+02 + 4 5.716564501715116e+02 -6.229917649784063e+01 5.188259821571465e+02 -2.317962680222806e+02 + 5 3.540803206003613e+02 -1.830475953812831e+02 -2.627760564652884e+02 -1.510469973555788e+02 + ME 2.211473180544441e-07 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.375361739991114e+02 2.755987077479748e+02 -1.573030807697318e+02 -5.529497863508248e+02 + 3 1.063537280575990e+02 -2.858383009670409e+01 1.513609572498958e+01 -1.013162313499492e+02 + 4 6.566870497058905e+01 3.794001125736362e+00 -5.133060130400937e+01 4.078178193435162e+01 + 5 6.904413929727007e+02 -2.508088787770070e+02 1.934975863487516e+02 6.134842357664223e+02 + ME 2.008478379670878e-04 + +Event 121 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.472844458410505e+02 
-2.887724852567566e+02 -3.914139567665229e+02 -2.508502159986242e+02 + 3 1.439229258812056e+02 -7.748866294914937e+01 1.168245717185551e+02 -3.257813916319178e+01 + 4 5.996216437235049e+02 2.361653319523242e+02 3.405657659024143e+02 4.333439865751322e+02 + 5 2.091709845542396e+02 1.300958162535815e+02 -6.597638085444669e+01 -1.499156314133161e+02 + ME 8.193298253663049e-08 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.721863109768985e+02 -3.366161482695963e+02 3.289037426473593e+02 -3.836416388601711e+01 + 3 3.043000907870282e+02 -6.182651453450119e+01 -1.228256199650946e+02 -2.714588264059559e+02 + 4 5.158404617584769e+02 2.163538140471089e+02 -2.337205323437163e+02 4.057796469704185e+02 + 5 2.076731364775958e+02 1.820888487569890e+02 2.764240966145164e+01 -9.595665667844540e+01 + ME 2.040699858317682e-08 + +Event 123 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.076427050299665e+02 2.733188279487669e+02 5.414201256136839e+01 -4.243425228923664e+02 + 3 5.243311343735360e+02 -5.185486655744496e+02 5.092607261139513e+01 5.862554931747282e+01 + 4 3.790262178945214e+02 1.868424378139138e+02 -5.104513483933526e+01 3.257992809881769e+02 + 5 8.899994270197608e+01 5.838739981176896e+01 -5.402295033342823e+01 3.991769258671668e+01 + ME 4.182562759973450e-06 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.524096836537796e+02 3.931871421189149e+02 1.244094353277442e+02 -5.055339345672225e+02 + 3 4.453257968846719e+02 -2.749272877886792e+02 1.288694994577424e+02 3.257647984507223e+02 + 4 
3.649512160194105e+02 -1.362256111675445e+02 -2.654954063631009e+02 2.101051216324037e+02 + 5 3.731330344213868e+01 1.796575683730913e+01 1.221647157761434e+01 -3.033598551590403e+01 + ME 4.942098831452094e-03 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.915960233698099e+02 -1.654995845044201e+02 -3.447055388379848e+01 -9.017041984449877e+01 + 3 6.550796868804265e+02 4.495369576307072e+02 -2.048853220836559e+02 4.301952169447749e+02 + 4 4.328930401647468e+02 -1.738692079710811e+02 5.578157194523531e+01 -3.924975146085699e+02 + 5 2.204312495850171e+02 -1.101681651552061e+02 1.835743040222191e+02 5.247271750829375e+01 + ME 4.440304215155750e-08 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.035701387136452e+02 9.136521656755118e+00 -8.511236795855699e+01 5.830250788488047e+01 + 3 5.780501404699306e+02 3.287868703966549e+02 4.051344723950226e+02 2.488116115242343e+02 + 4 2.667378796160922e+02 1.255353362675918e+02 -2.344633536281414e+02 -2.041841266398259e+01 + 5 5.516418412003314e+02 -4.634587283210018e+02 -8.555875080832423e+01 -2.866957067451323e+02 + ME 3.855870291536328e-08 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.008254008745652e+02 -2.121113113583696e+02 -3.036077285076011e+01 2.111467188404314e+02 + 3 2.220883382942753e+02 -1.288724105068163e+02 -1.058087522116511e+02 -1.466957387652706e+02 + 4 4.817604596011059e+02 3.195498937336952e+02 -1.950871089920132e+02 3.031864537788441e+02 + 5 4.953258012300539e+02 2.143382813149056e+01 3.312566340544244e+02 
-3.676374338540048e+02 + ME 7.778617714231280e-09 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.497673085746553e+02 7.660884888714843e+01 -6.253597043077772e+02 1.589146934447123e+02 + 3 1.067691597532546e+02 6.343721766028853e+01 6.858194366252209e+01 5.169032688431987e+01 + 4 6.255741311782837e+02 -1.271462240130817e+02 5.209429149313172e+02 -3.221727963400119e+02 + 5 1.178894004938061e+02 -1.289984253435511e+01 3.583484571393804e+01 1.115677760109796e+02 + ME 2.372674068751716e-07 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.455471401102499e+02 5.611838162748451e+01 -1.972996399049179e+02 1.349702832227317e+02 + 3 5.464479261367328e+02 3.973915006851445e+02 3.192093966153584e+02 -1.969535282111082e+02 + 4 3.628510278873775e+02 -2.872701579660765e+02 -1.541290438384744e+01 -2.211315607495373e+02 + 5 3.451539058656393e+02 -1.662397243465526e+02 -1.064968523265930e+02 2.831148057379137e+02 + ME 1.392545512414346e-07 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.021391279871391e+02 7.662958214582308e+01 -5.633752971808341e+01 -3.723427656600568e+01 + 3 6.760797762746121e+02 2.243389930768461e+02 2.520516979585487e+02 -5.858547786175416e+02 + 4 4.648353497198455e+02 -1.876583671396737e+02 -1.546190406153393e+02 3.961681358600594e+02 + 5 2.569457460184037e+02 -1.133102080829954e+02 -4.109512762512610e+01 2.269209193234879e+02 + ME 8.190248798531356e-04 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.920919318876586e+02 7.263675942251076e+01 -2.088562075937370e+02 3.237886174366762e+02 + 3 5.458102506861221e+02 3.135522126877631e+02 4.454057185571284e+02 -3.475033164249447e+01 + 4 5.478401415085880e+02 -3.922090216845467e+02 -2.494405778346952e+02 -2.899660360919773e+02 + 5 1.425767591763096e+01 6.020049574273062e+00 1.289106687130388e+01 9.277502977958795e-01 + ME 9.978073576505814e-06 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.255682887750994e+02 -1.020202862582230e+01 -3.071270560190278e+02 4.263694597869200e+02 + 3 1.822944083831003e+02 -6.223997393813503e+01 7.452404233473395e+01 1.542841666731088e+02 + 4 3.498626937077319e+02 -7.824785520990129e+01 -9.684804908206219e+01 -3.269581517449581e+02 + 5 4.422746091340672e+02 1.506898577738587e+02 3.294510627663556e+02 -2.536954747150707e+02 + ME 2.981826762802143e-06 + +Event 133 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.629662388842298e+02 7.991123416805507e+01 -7.432810167831423e+01 3.461705049890880e+02 + 3 2.519974118268759e+02 4.092829737955623e+01 1.045933848073266e+02 -2.255832305283377e+02 + 4 4.540092238244611e+02 2.067709404411446e+02 -7.816124526092358e+01 -3.965614369124107e+02 + 5 4.310271254644334e+02 -3.276104719887559e+02 4.789596213191121e+01 2.759741624516604e+02 + ME 1.113198017190941e-05 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.371391815360341e+02 2.182303519531751e+02 2.447820570023349e+02 
4.254124567961070e+02 + 3 4.475096572574562e+02 8.761259551627977e+01 -1.064978001639930e+02 -4.257313061143341e+02 + 4 1.024195877641454e+02 3.874643372548357e+01 8.545199625557882e+01 -4.106631426450081e+01 + 5 4.129315734423644e+02 -3.445893811949385e+02 -2.237362530939207e+02 4.138516358272791e+01 + ME 2.183072642098643e-06 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.611715485115952e+02 1.353574756360551e+02 1.270899083363582e+02 1.836765819053572e+02 + 3 4.982087211139469e+02 3.182740341581292e+02 1.027522253756156e+02 -3.692635226434683e+02 + 4 3.610184850208989e+02 -3.443761350776684e+02 -1.457446762716320e+01 -1.073639092485802e+02 + 5 3.796012453535586e+02 -1.092553747165158e+02 -2.152676660848105e+02 2.929508499866913e+02 + ME 3.714656407574500e-07 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.165975957141717e+02 4.071301662611801e+02 -7.016168908300483e+01 -5.362762217357892e+01 + 3 5.429674882429235e+02 -4.587193013588839e+02 2.631983131614909e+02 1.229509812835685e+02 + 4 2.969286679985584e+02 3.813185169859016e+01 -7.853062280176133e+01 -2.838054563390252e+02 + 5 2.435062480443467e+02 1.345728339911383e+01 -1.145060012767249e+02 2.144820972290356e+02 + ME 2.382499893289027e-08 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.164799518089250e+02 4.078481304962671e+02 7.086625433408588e+01 -4.575398022162617e+01 + 3 4.119129224428050e+02 1.032217184413983e+01 -1.307030352153062e+02 3.904899807028329e+02 + 4 4.185485375125856e+02 -3.640780696725426e+02 
-1.159009525286995e+02 -1.708713160257732e+02 + 5 2.530585882356848e+02 -5.409223266786427e+01 1.757377334099197e+02 -1.738646844554335e+02 + ME 3.932295452856433e-08 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.328295202229158e+02 -1.377487194047889e+01 -1.873512771356643e+02 1.375475821157613e+02 + 3 6.673431588212470e+02 -4.246598188414665e+02 4.577416587064384e+02 -2.355493658911680e+02 + 4 4.060704879661815e+02 3.031583118816545e+02 -2.670840197200184e+02 -4.067438437585884e+01 + 5 1.937568329896558e+02 1.352763789002910e+02 -3.306361850755684e+00 1.386761681512658e+02 + ME 1.273594564124628e-07 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.641866696514710e+02 9.276406442189763e+01 3.425073851357022e+02 2.992537113580343e+02 + 3 4.084681019724289e+02 -1.718164587475395e+02 -6.269563677788108e+01 -3.652321891091058e+02 + 4 3.586483593020715e+02 2.437331083193391e+02 -1.380106438398361e+02 2.239997315310333e+02 + 5 2.686968690740283e+02 -1.646807139936973e+02 -1.418011045179850e+02 -1.580212537799619e+02 + ME 5.495957845857057e-09 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.077745749323772e+02 -2.717749029281370e+02 -4.488259523209480e+02 -3.067305015739836e+02 + 3 3.836748848621314e+02 3.299395351490867e+02 1.859302801942361e+02 -6.145121094386256e+01 + 4 1.356054338659603e+02 5.736453748498842e+00 -2.020987384238928e+01 1.339682342633558e+02 + 5 3.729451063395315e+02 -6.390108596944859e+01 2.831055459691011e+02 2.342134782544906e+02 + ME 4.372848154299820e-07 + +Event 
141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.541592345202995e+02 5.064743234660293e+01 1.008919682868332e+02 -3.356849888331461e+02 + 3 5.737033413285019e+02 4.649244927868489e+02 -2.879027536526492e+02 1.734726038819985e+02 + 4 2.491572537229339e+02 -2.269136202053799e+02 7.415864097377921e+01 7.134453039277231e+01 + 5 3.229801704282646e+02 -2.886583049280719e+02 1.128521443920367e+02 9.086785455837524e+01 + ME 2.819141782405158e-03 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.672503466682872e+02 -5.085474185662209e+02 -2.883041300906012e+01 4.310120131554450e+02 + 3 3.301134862993139e+02 3.214141834727916e+02 -2.596291776574529e+01 -7.066656493912240e+01 + 4 4.453287536751282e+02 2.428052104961861e+02 6.644020670411230e+01 -3.673540901349277e+02 + 5 5.730741335727041e+01 -5.567197540275679e+01 -1.164687592930690e+01 7.008641918605127e+00 + ME 1.230100599570473e-05 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.961372424533696e+02 3.096839815776747e+02 -7.461011777250016e+01 3.803699846588919e+02 + 3 3.532673160900626e+02 -2.702239200247365e+02 1.096988588437250e+02 -1.993564396560228e+02 + 4 2.470296765764949e+02 -1.247422515683724e+02 9.161572144761085e+01 1.925346497822518e+02 + 5 4.035657648800728e+02 8.528219001543420e+01 -1.267044625188357e+02 -3.735481947851208e+02 + ME 1.071211155489203e-08 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 2.484370908485413e+02 6.681262790792989e+01 4.372567917003316e+01 2.352554480611282e+02 + 3 1.333566163493900e+02 7.639041904632977e+01 -1.037373442449629e+02 -3.445365599013519e+01 + 4 6.759161690057646e+02 -2.745699937733856e+02 -8.241738839859467e+00 -6.175808123505842e+02 + 5 4.422901237963047e+02 1.313669468191260e+02 6.825340391478923e+01 4.167790202795914e+02 + ME 1.211591603208016e+00 + +Event 145 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.699419653731483e+02 -1.778511711731030e+02 -1.995508520752477e+02 3.764416313210161e+01 + 3 5.364082066653290e+02 -3.670657833361443e+02 9.343822714167085e+01 -3.798233439249868e+02 + 4 4.985199767048500e+02 3.642791687227051e+02 3.239621342782176e+01 3.387821420382697e+02 + 5 1.951298512566719e+02 1.806377857865423e+02 7.371641150575518e+01 3.397038754615278e+00 + ME 9.734793093096349e-08 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.202675651747352e+02 -2.044738957939417e+02 6.060766212155786e+01 -5.509026682648761e+01 + 3 3.666300263497007e+02 1.440597763151976e+02 1.820522147186127e+02 -2.837628379214685e+02 + 4 2.848419345521625e+02 -2.522934903196281e+02 1.041889412010833e+02 -8.140999295636458e+01 + 5 6.282604739234019e+02 3.127076097983722e+02 -3.468488180412539e+02 4.202630977043208e+02 + ME 4.879534947396966e-08 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.466917873161715e+02 -8.985124093282325e+01 1.062886307955967e+02 -4.634611021911086e+01 + 3 5.083338423321477e+02 -4.344326257841562e+02 
1.725039954100384e+02 1.997847854117565e+02 + 4 1.758983079855465e+02 -2.529732669417202e+01 1.025981972445037e+02 -1.406195930035513e+02 + 5 6.690760623661345e+02 5.495811934111517e+02 -3.813908234501387e+02 -1.281908218909440e+01 + ME 1.131670131763126e-07 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.170646172696435e+02 2.327689577119409e+02 -1.156815006470080e+02 -1.815664458486163e+02 + 3 3.480509574692186e+02 -2.600917732635421e+02 2.279827485885898e+02 3.893076960269328e+01 + 4 4.899885869304755e+02 -6.865708088891657e+01 -4.361073402046575e+02 2.125685968941701e+02 + 5 3.448958383306617e+02 9.597989644051778e+01 3.238060922630756e+02 -6.993292064824705e+01 + ME 7.934339568893859e-09 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.565332013646124e+02 1.739097965881151e+01 1.427314035696600e+02 6.187037571403586e+01 + 3 1.287425867544280e+02 -3.692794701983706e+01 1.058940407929810e+02 6.322525205688025e+01 + 4 6.163211995179090e+02 3.680162870870676e+02 -3.480595054229865e+02 3.510988665583380e+02 + 5 5.984030123630489e+02 -3.484793197260416e+02 9.943406106034590e+01 -4.761944943292547e+02 + ME 9.629479157906087e-06 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.336561333256778e+02 1.834876654089863e+02 -2.602841456593769e+01 -1.423024487443929e+02 + 3 1.741966417859597e+02 -7.277820045868700e+01 4.362439920231233e+01 -1.521338730912299e+02 + 4 6.077110042555054e+02 2.904143808356973e+02 1.411750271757575e+02 5.148220700387442e+02 + 5 4.844362206328570e+02 
-4.011238457859966e+02 -1.587710118121322e+02 -2.203857482031216e+02 + ME 3.485580444106337e-07 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.211746741680918e+02 -1.040856261513666e+02 -3.937271548256161e+01 -4.795073893018128e+01 + 3 2.923110311840379e+02 1.033353099237875e+01 -2.569705782408390e+02 -1.389427184406362e+02 + 4 6.933360159289849e+02 4.046737014260277e+02 5.364619473139212e+02 1.707706223054521e+02 + 5 3.931782787188855e+02 -3.109216062670398e+02 -2.401186535905205e+02 1.612283506536545e+01 + ME 4.663579598060312e-07 + +Event 152 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.221790270328912e+02 -1.025810599253255e+02 -2.803205211746588e+02 -1.212305943762748e+02 + 3 2.743866059556542e+02 -1.673110927218417e+02 2.121850586617832e+02 4.767083658571897e+01 + 4 5.652268340158244e+02 5.594619569444935e+02 -7.688643394315451e+01 -2.392005223436809e+01 + 5 3.382075329956307e+02 -2.895698042973263e+02 1.450218964560301e+02 9.747981002492391e+01 + ME 4.521908145616384e-09 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.181242749149867e+02 -1.496553803635585e+02 3.218925844259840e+02 2.209442022162673e+02 + 3 2.742594598422451e+02 -1.018045508856852e+02 -1.318386065039126e+02 2.178822309575300e+02 + 4 3.466792897228671e+02 -1.619951176147567e+02 -1.965669238598627e+02 -2.351713337888010e+02 + 5 4.609369755199013e+02 4.134550488640003e+02 6.512945937791542e+00 -2.036550993849962e+02 + ME 6.406934045235203e-08 + +Event 154 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.415097258757403e+02 -2.050410225971519e+02 -1.222122769468913e+02 3.673263002217224e+01 + 3 2.736230574879775e+02 -2.193782751409555e+02 1.166795472694783e+02 1.145802480108935e+02 + 4 4.673057057314866e+02 1.745358467961839e+01 1.069490661195013e+02 -4.545678081949938e+02 + 5 5.175615109047959e+02 4.069657130584891e+02 -1.014163364420883e+02 3.032549301619279e+02 + ME 7.300744359937460e-08 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.774233833471055e+02 -3.813733771182958e+02 -7.748207752183868e+01 2.765573391810157e+02 + 3 2.785548789953623e+02 5.565153575418105e+01 1.962114983217827e+02 -1.897281610795908e+02 + 4 3.469368907421938e+02 -2.485484467451775e+00 -3.410354529311986e+02 -6.366983876872376e+01 + 5 3.970848469153383e+02 3.282073258315664e+02 2.223060321312547e+02 -2.315933933270110e+01 + ME 9.358201900073783e-09 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.830763202836453e+02 1.670390722374144e+02 -4.839353367414558e+02 2.790744228239374e+02 + 3 8.372937424277062e+01 -2.206118779177231e+01 -7.862990681346045e+01 1.847294938034042e+01 + 4 6.517262335322765e+02 -1.517360023471754e+02 4.537037704279378e+02 -4.425789847759814e+02 + 5 1.814680719413077e+02 6.758117901533360e+00 1.088614731269784e+02 1.450316125717036e+02 + ME 1.040311123128328e-06 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.221466021800015e+02 
1.004197184024976e+02 -1.831200458211014e+01 -1.973060148709808e+02 + 3 1.952944116425906e+02 5.947164367963718e+01 -1.637405152185915e+02 8.827272791082248e+01 + 4 5.043036849209495e+02 1.030300241331701e+02 2.995278680472508e+02 -3.924156941497569e+02 + 5 5.782553012564586e+02 -2.629213862153049e+02 -1.174753482465492e+02 5.014489811099153e+02 + ME 2.541361172057728e-08 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.014796349232532e+02 6.324463617576040e+01 -5.143472614980268e+01 2.902491261215933e+02 + 3 5.569806323774446e+02 -2.837656127180175e+01 -3.396072279214706e+02 -4.405554747786710e+02 + 4 2.957632477700599e+02 2.349299171863054e+01 2.481969354528773e+02 -1.591296933428909e+02 + 5 3.457764849292423e+02 -5.836106662258911e+01 1.428450186183959e+02 3.094360419999686e+02 + ME 5.423333431647977e-05 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.591538548774634e+02 -2.182240714207247e+02 -4.209612865776884e+01 -1.332925020488410e+02 + 3 6.478059984032089e+02 2.970279686703716e+02 4.046570960198609e+02 4.094870352471539e+02 + 4 2.116191389359584e+02 9.445193097144917e+01 -1.887137862457113e+02 1.576704095793049e+01 + 5 3.814210077833698e+02 -1.732558282210961e+02 -1.738471811163809e+02 -2.919615741562435e+02 + ME 4.258838904318967e-07 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.355516328348967e+02 4.598601326351700e+02 1.086179643103354e+02 -2.520840091595789e+02 + 3 1.250913851056828e+02 2.412231230375017e+01 9.532685281942435e+01 -7.732244052228563e+01 + 4 
1.403519283462578e+02 1.060977550664594e+02 6.322679457710814e+01 6.666560288390733e+01 + 5 6.990050537131629e+02 -5.900802000053795e+02 -2.671716117068678e+02 2.627408467979571e+02 + ME 3.157974474695796e-07 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.037519486668241e+02 -4.556894118761883e+02 -3.789671218007896e+02 1.150977672109518e+02 + 3 1.320163331334138e+02 -1.009130349982057e+02 -4.400086191303104e+01 7.286148318799006e+01 + 4 4.535812871805164e+02 3.214261890918493e+02 2.240876447507701e+02 -2.284861407531699e+02 + 5 3.106504310192461e+02 2.351762577825446e+02 1.988803389630505e+02 4.052689035422792e+01 + ME 3.757369932492492e-06 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.806951845247627e+02 2.263617881792713e+02 1.983837182497826e+02 -2.330945402750066e+02 + 3 6.226282282350211e+02 -5.665599382822433e+02 -1.169170394997212e+02 2.302306513115117e+02 + 4 1.991674245974932e+02 8.131880151281560e+01 -1.065651751520856e+02 1.473050541771148e+02 + 5 2.975091626427234e+02 2.588793485901565e+02 2.509849640202416e+01 -1.444411652136197e+02 + ME 4.258978540792548e-07 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.775491654024494e+02 -2.389339247301170e+02 -3.569712417423736e+01 -2.901373247874839e+02 + 3 4.057858154423257e+02 1.211038531074871e+02 -1.075956188633677e+02 3.720472652474713e+02 + 4 3.781881739343797e+02 -1.691066352891619e+02 3.211864194868994e+02 -1.061533077307745e+02 + 5 3.384768452208452e+02 2.869367069117919e+02 -1.778936764492943e+02 
2.424336727078715e+01 + ME 1.225821424327284e-08 + +Event 164 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.565339415542259e+02 -2.071226667867270e+02 -3.991710195847456e+02 7.865073310444370e+01 + 3 1.850572037534592e+02 -1.814377768563993e+02 3.634341818882847e+01 2.378601439647877e+00 + 4 1.921815529404395e+02 -8.912051195266237e+01 -2.935060714794935e+01 1.677194845566282e+02 + 5 6.662273017518758e+02 4.776809555957887e+02 3.921782085438665e+02 -2.487488191007194e+02 + ME 8.830907796004414e-08 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.608706475726781e+02 -7.352587451384150e+01 2.926483329335018e+01 1.400602747381999e+02 + 3 4.368533973739729e+02 5.240491187105326e+01 3.687621041233544e+02 2.282742354526359e+02 + 4 6.868232300507626e+02 -1.068225318099977e+02 -4.089841618100872e+02 -5.413382042896707e+02 + 5 2.154527250025866e+02 1.279434944527859e+02 1.095722439338236e+01 1.730036940988348e+02 + ME 2.214163213888219e-06 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.403514225384916e+02 -4.364754696013563e+02 2.109194731869347e+00 5.825882821969267e+01 + 3 1.069411967678109e+02 -8.113600881769635e+01 2.298926129200185e+01 6.576367921972141e+01 + 4 6.275848674095952e+02 5.117980899513775e+02 2.830058390250813e+02 -2.276690053594267e+02 + 5 3.251225132841022e+02 5.813388467675122e+00 -3.081042950489525e+02 1.036464979200126e+02 + ME 2.984603572739545e-07 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.918955211023958e+02 -1.623891733199485e+02 -1.011860556445403e+02 -1.466388593530866e+01 + 3 1.871901284559375e+02 -7.878242912860121e+01 -1.694437551331697e+02 -1.105834066746689e+01 + 4 6.102188811441906e+02 -1.343591330747666e+02 5.476768981515617e+02 2.331624358582424e+02 + 5 5.106954692974763e+02 3.755307355233163e+02 -2.770470873738516e+02 -2.074402092554669e+02 + ME 3.397865995504970e-07 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.216787680814041e+02 -1.189606477208519e+02 1.547783496418446e+02 1.050404826439369e+02 + 3 5.126462182586654e+02 -1.590819643619220e+02 4.209999834531878e+02 2.454752281638187e+02 + 4 4.272224542767368e+02 7.779140932193557e+01 -3.932611112348422e+02 -1.476929939580156e+02 + 5 3.384525593831933e+02 2.002512027608385e+02 -1.825172218601902e+02 -2.028227168497398e+02 + ME 2.715202851344253e-05 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.698429879806192e+02 1.455074202745654e+02 2.183111666053455e+01 -4.462100697484889e+02 + 3 6.804356369583268e+02 -3.541873855898969e+02 2.380635543828890e+01 5.804973810213298e+02 + 4 2.494072380391387e+02 2.129062287565781e+02 5.332877252319550e+01 -1.184523117720059e+02 + 5 1.003141370219151e+02 -4.226263441246972e+00 -9.896624462201893e+01 -1.583499950083470e+01 + ME 1.016300919548716e-06 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.947525114000200e+02 -2.120980687331741e+02 -5.339573876161243e+02 
1.537350554247554e+02 + 3 5.304186438182739e+02 9.762854658278448e+01 4.210999845330488e+02 -3.073880408025274e+02 + 4 2.725467755647544e+02 1.192808703849826e+02 2.108782617781282e+02 1.248366033676854e+02 + 5 1.022820692169514e+02 -4.811348234593062e+00 -9.802085869505261e+01 2.881638201008670e+01 + ME 6.536648372769181e-06 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.224797308226669e+02 -5.375302845466814e+02 -1.836636776324524e+01 3.133765865782073e+02 + 3 4.124091288166618e+02 2.382619823047310e+02 3.210619006365506e+01 -3.350846309205101e+02 + 4 1.838688075380518e+02 8.727789435881400e+01 1.413249053618683e+02 -7.885162437271516e+01 + 5 2.812423328226197e+02 2.119904078831364e+02 -1.550647276622781e+02 1.005596687150179e+02 + ME 2.229289818847732e-07 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.024481242448970e+02 1.796358471701274e+01 -1.746853017443778e+02 1.007352867115970e+02 + 3 5.859822858732682e+02 -2.176592218037537e+01 3.060477525768988e+02 4.992356729356663e+02 + 4 6.608756657583934e+02 2.436588734248681e+01 -1.382800479300340e+02 -6.457875637313563e+02 + 5 5.069392412344163e+01 -2.056354987912417e+01 6.917597097512822e+00 4.581660408409283e+01 + ME 1.416746548334385e-05 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.801691995127820e+02 -4.215202314930910e+02 3.958855854346894e+02 -4.681450363238424e+01 + 3 2.617744476703427e+02 1.599633143319359e+02 -6.847827136272012e+01 -1.955717921179559e+02 + 4 2.895562615833315e+02 -6.776676019363940e+01 
-1.625404652143166e+02 2.298501511917841e+02 + 5 3.685000912335435e+02 3.293236773547945e+02 -1.648668488576526e+02 1.253614455855600e+01 + ME 1.421424450431506e-08 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.432224768586810e+02 2.123358321375093e+02 -9.137617595590747e+01 2.537036804604031e+02 + 3 2.662389127170579e+02 5.658820252344638e+01 1.140858201862947e+02 2.338062437411521e+02 + 4 6.440666757949750e+02 -7.559162042568607e+01 -7.005998003003399e+00 -6.395769740875528e+02 + 5 2.464719346292864e+02 -1.933324142352695e+02 -1.570364622738399e+01 1.520670498859977e+02 + ME 9.459585443973610e-07 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.700131770443521e+02 1.537969471106570e+02 -2.117531928678223e+02 -6.643944722308586e+01 + 3 4.962752034245966e+02 2.128838609308023e+02 -7.929139726182321e+01 4.412283010070720e+02 + 4 2.542695012796544e+02 6.381377149808638e+00 2.420365821040695e+02 -7.765661742605151e+01 + 5 4.794421182513966e+02 -3.730621851912678e+02 4.900800802557609e+01 -2.971322363579344e+02 + ME 4.233165528172994e-08 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.595250760942820e+02 -3.781994799062178e+02 -1.953391788690975e+02 -3.631474485325831e+02 + 3 2.822193451057853e+02 2.531285812406774e+02 1.162602688787051e+02 -4.535669731441308e+01 + 4 3.615103193569824e+02 1.649061800696584e+00 -1.746256934103365e+02 3.165325556680135e+02 + 5 2.967452594429505e+02 1.234218368648438e+02 2.537046034007288e+02 9.197159017898271e+01 + ME 7.642284594467364e-08 + +Event 177 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.396956142606758e+02 -4.105423218103476e+01 1.309817345502700e+02 2.594609561949185e+01 + 3 5.462636835449130e+02 1.226445420924370e+02 -2.117083223539773e+02 -4.884075291199530e+02 + 4 5.587261113115388e+02 1.501945737199239e+02 1.776131549773776e+02 5.080059297541580e+02 + 5 2.553145908828726e+02 -2.317848836313263e+02 -9.688656717367031e+01 -4.554449625369679e+01 + ME 2.391077630198967e-07 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.088496927414232e+02 -1.341610898283949e+02 -9.277802989315123e+01 3.749014675884305e+02 + 3 3.588364211022619e+02 3.077768962673022e+02 -8.016131802617933e+01 -1.661659481538016e+02 + 4 4.878368684282141e+02 -9.295843092916157e+01 3.953926925341257e+02 -2.702002202213190e+02 + 5 2.444770177281012e+02 -8.065737550974578e+01 -2.224533446147952e+02 6.146470078669034e+01 + ME 8.502571044762264e-08 + +Event 179 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.795210385243066e+02 4.384022819240557e+02 2.074574650990888e+02 -4.759345345481805e+02 + 3 4.130397129911726e+02 -1.166933801910929e+02 -8.188519392843642e+01 3.876587088441451e+02 + 4 1.323830995986130e+02 -9.020264614934969e+01 2.283949186981634e+01 9.416541455396502e+01 + 5 2.750561488859077e+02 -2.315062555836131e+02 -1.484117630404687e+02 -5.889588849929629e+00 + ME 9.818726700017822e-06 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.390143503225903e+02 4.332330366673434e+02 3.187886675509226e+02 -3.491978285705225e+01 + 3 2.121543783556901e+02 -1.539133636629621e+02 -6.982443694241508e+01 -1.282369086773257e+02 + 4 3.525195899359830e+02 3.116523503389996e+01 -3.453504862598982e+02 -6.349670112665112e+01 + 5 3.963116813857368e+02 -3.104849080382812e+02 9.638625565139064e+01 2.266533926610290e+02 + ME 1.561098728950639e-08 + +Event 181 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.871910159031975e+02 -2.482278605801525e+02 -4.059757113335109e+02 1.045071146661306e+02 + 3 3.939631927235100e+02 3.843058254746379e+02 -6.751539482803386e+01 -5.438475143248536e+01 + 4 1.320318755627035e+02 1.173355086182817e+02 5.989553638691459e+01 -8.793139531371171e+00 + 5 4.868139158105894e+02 -2.534134735127670e+02 4.135955697746302e+02 -4.132922370227408e+01 + ME 1.157324702747829e-07 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.634643361350383e+02 6.329356748624852e+01 1.470860510178153e+02 -3.286650420895992e+01 + 3 5.024217881602822e+02 -4.023414251461463e+02 -1.657157218704947e+02 2.511719139443823e+02 + 4 3.341400639218183e+02 -1.066841963158697e+01 3.309585949660342e+02 -4.474567642073620e+01 + 5 4.999738117828628e+02 3.497162772914849e+02 -3.123289241133547e+02 -1.735597333146861e+02 + ME 1.000511008264552e-08 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.995054708334443e+02 -8.618442988121295e+01 -1.317716324156771e+02 -4.740402897740854e+02 + 3 3.459503058168963e+02 2.817467670894948e+02 
-1.997611117656004e+02 1.989652119324885e+01 + 4 9.946167962959554e+01 4.869462143288413e+01 -5.138636892793455e+00 8.657397974641351e+01 + 5 5.550825437200641e+02 -2.442569586411661e+02 3.366713810740709e+02 3.675697888344230e+02 + ME 5.896827781127867e-07 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.245304000524380e+02 -3.884223434549884e+02 -1.908539669459332e+02 2.963696795981436e+02 + 3 2.116914247222055e+02 -1.518308824598570e+02 -9.307419176861659e+01 1.144457830546720e+02 + 4 1.497828062091506e+02 5.152418898782463e+01 3.289577420345209e+01 -1.367406853302644e+02 + 5 6.139953690162059e+02 4.887290369270207e+02 2.510323845110977e+02 -2.740747773225511e+02 + ME 7.676964070009145e-03 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.823567631295894e+02 -3.287439770113579e+02 -1.408601587358858e+02 -1.352128232476263e+02 + 3 2.729601432779185e+02 1.050401758721174e+02 1.518250728524651e+02 2.010545909060525e+02 + 4 4.415844326987421e+02 1.841356276793078e+02 -4.001213558867116e+02 3.152431417805643e+01 + 5 4.030986608937497e+02 3.956817345993269e+01 3.891564417701323e+02 -9.736608183648251e+01 + ME 6.903506453576635e-09 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.441943751899469e+02 -3.172061664525860e+01 -2.165035809528642e+02 -1.084015442170437e+02 + 3 1.531216649467744e+02 -1.072415316939105e+02 -6.400357868200553e+01 -8.859480838013359e+01 + 4 6.690842740855497e+02 -1.683192937014952e+02 5.498056012764863e+02 3.421347424647482e+02 + 5 4.335996857777266e+02 
3.072814420406639e+02 -2.692984416416154e+02 -1.451383898675704e+02 + ME 2.641658975633361e-07 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.527752494388041e+02 -4.473442706504163e+02 6.157878466879122e+01 -3.311469914302317e+01 + 3 4.846819923263979e+02 1.808338503617970e+02 -3.063074014757466e+02 3.292286865538744e+02 + 4 4.767615702494120e+02 2.297431564813573e+02 3.180144106698235e+02 -2.708994491005699e+02 + 5 8.578118798538661e+01 3.676726380726195e+01 -7.328579386286809e+01 -2.521453831028136e+01 + ME 4.220499843885650e-08 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.763744608693619e+02 1.879714650814332e+02 1.531545296300208e+02 -1.326395905820623e+02 + 3 1.675411521312716e+02 -1.163100541390447e+02 6.574161087985149e+00 -1.204109188140162e+02 + 4 6.612223670090391e+02 1.024684272345772e+02 -3.695690310853153e+02 5.386408555946808e+02 + 5 3.948620199903277e+02 -1.741298381769656e+02 2.098403403673093e+02 -2.855903461986022e+02 + ME 9.494482117632271e-08 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.783469474210132e+02 -8.658737187334013e+01 1.356909096492870e+02 4.504513360815113e+02 + 3 4.456648861487530e+01 -2.142659180653914e+01 -3.868877590386524e+01 -5.500153641150110e+00 + 4 7.225573371875209e+02 1.692206577779368e+02 -1.495772879551335e+02 -6.863527587419684e+02 + 5 2.545292267765901e+02 -6.120669409805739e+01 5.257515420971162e+01 2.414015763016072e+02 + ME 2.011575952257779e-01 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.849447494768668e+02 4.417222124371037e+02 3.063930417948522e+02 -2.305756945558716e+02 + 3 8.747756552706328e+01 -7.131945592982622e+01 5.022797529361168e+01 -6.558214263010734e+00 + 4 6.441296778117609e+02 -3.837545514381533e+02 -5.018072724050934e+02 1.257972473854055e+02 + 5 1.834480071843092e+02 1.335179493087577e+01 1.451862553166296e+02 1.113366614334768e+02 + ME 7.067046401190182e-08 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.324822165476273e+02 -6.253195280175738e+02 7.607762994717298e+01 -5.675769860115388e+01 + 3 9.968007454278923e+01 -5.919441699588452e+01 6.439562903151095e+01 4.780524258916985e+01 + 4 5.616099301916483e+01 -9.982478114887614e+00 1.074288744286909e+00 -5.525625006529460e+01 + 5 7.116767158904186e+02 6.944964231283459e+02 -1.415475477229708e+02 6.420870607727863e+01 + ME 2.481593467967045e-06 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.369395837007374e+02 -4.355765469733402e+00 2.422016200467208e+02 -3.636421350153063e+02 + 3 3.489363876291524e+02 2.257513727555906e+02 -9.443187595205370e+01 -2.487479469559988e+02 + 4 1.988200442993521e+02 -6.004084227260262e+01 -1.441955006989127e+02 1.230128645816471e+02 + 5 5.153039843707588e+02 -1.613547650132546e+02 -3.574243395754780e+00 4.893772173896579e+02 + ME 3.947295067855795e-06 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.952170711629280e+02 
-2.448161671907030e+02 6.058136449560857e+01 -1.534537769620588e+02 + 3 4.390044396065629e+02 -3.923286441491609e+02 -3.710622564906000e+01 -1.934586803233041e+02 + 4 4.491243422525341e+02 3.349194872100991e+02 5.492135841287628e+01 2.941517572287814e+02 + 5 3.166541469779748e+02 3.022253241297647e+02 -7.839649725942485e+01 5.276070005658146e+01 + ME 8.794765507913085e-06 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.033148328845124e+02 3.807229676197026e+02 4.722829680298763e+01 -1.244281492773959e+02 + 3 3.402574757440812e+02 -4.112898360893274e+01 3.368414662709729e+02 -2.492755718980818e+01 + 4 3.318299773052011e+02 6.453189644180645e+01 -3.237215919515893e+02 3.392785115982095e+01 + 5 4.245977140662055e+02 -4.041258804525763e+02 -6.034817112237117e+01 1.154278553073833e+02 + ME 5.247187342747204e-09 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.009160799820447e+02 1.127718590509784e+02 2.578905406128325e+02 1.064211635924338e+02 + 3 5.195297064167322e+02 3.024829678234382e+02 7.496264536843978e+01 -4.156871080819640e+02 + 4 2.228454712975529e+02 -1.162059794317317e+02 -1.900820329549402e+02 -5.009507945643895e+00 + 5 4.567087423036699e+02 -2.990488474426849e+02 -1.427711530263320e+02 3.142754524351741e+02 + ME 7.937827288921181e-08 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.865816169777639e+02 -2.082718992755388e+02 -1.168755264994778e+02 3.039872922718519e+02 + 3 4.950634710774064e+02 4.481345969831610e+02 -1.338922043908974e+00 2.103840076377247e+02 + 4 
5.009936462013424e+02 -2.363057974813166e+02 2.672200856348095e+01 -4.409536686196952e+02 + 5 1.173612657434876e+02 -3.556900226305540e+00 9.149243997990575e+01 -7.341763128988138e+01 + ME 1.861814818910771e-06 + +Event 197 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.200952246637105e+01 -4.092854273592183e+01 4.451595293266989e+01 1.372462023217837e+01 + 3 2.631639131516304e+02 1.236751000292302e+02 -2.240103694280010e+02 6.147413445319324e+01 + 4 4.924430755200161e+02 6.566832142306471e+01 -3.082242226360131e+02 3.783988408670338e+02 + 5 6.823834888619817e+02 -1.484148787163730e+02 4.877186391313444e+02 -4.535975955524060e+02 + ME 2.789695690630115e-06 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.104475683108870e+02 -1.076092727069166e+02 -1.243307094385727e+01 2.154828346252984e+01 + 3 4.258110791144966e+02 -1.061255031183437e+02 3.244981883629733e+02 2.544668513454730e+02 + 4 5.489837505506218e+02 -1.103287384131400e+02 -5.228663656420097e+02 -1.257835105612362e+02 + 5 4.147576020239940e+02 3.240635142384003e+02 2.108012482228936e+02 -1.502316242467664e+02 + ME 4.509429214892343e-08 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.945214729162650e+02 3.938714004469919e+02 1.610598617964423e+01 1.590942208647365e+01 + 3 4.307182563216123e+02 5.676770947025447e+01 3.019258642913346e+02 -3.018880851609780e+02 + 4 6.132249141587976e+02 -4.789484991716052e+02 -3.037239430541908e+02 2.332485732045239e+02 + 5 6.153535660332505e+01 2.830938925435877e+01 -1.430790741678789e+01 
5.273008986998033e+01 + ME 2.825916246701146e-07 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.680965384249500e+02 -4.164985458027597e+00 3.568534147325901e+02 -9.018511457139309e+01 + 3 1.116595944155914e+02 -3.841516463813667e+01 1.308374745062447e+01 1.040238227692461e+02 + 4 5.078831428275769e+02 -3.645818734439150e+02 -2.334033354278479e+02 2.656091627480077e+02 + 5 5.123607243318817e+02 4.071620235400787e+02 -1.365338267553668e+02 -2.794478709458607e+02 + ME 2.955488344440729e-07 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.403128193251625e+02 1.213134048128906e+02 -5.665569190553459e+01 4.196281275587439e+01 + 3 5.694904595392480e+02 2.782425801842742e+02 1.994110039347696e+02 4.551216338390749e+02 + 4 4.865766137247840e+02 -3.320140599055400e+02 -2.710610759019411e+02 -2.303244628340644e+02 + 5 3.036201074108052e+02 -6.754192509162509e+01 1.283057638727062e+02 -2.667599837608849e+02 + ME 4.342531742224749e-07 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.648494953542230e+02 1.475915136963884e+02 -2.626638545079027e+02 -2.057658836641568e+02 + 3 4.772142399752357e+02 -2.858814296920360e+02 -2.698032465892513e+02 2.705761392330796e+02 + 4 3.044384819161785e+02 -9.936314325722020e+01 2.866772864774831e+02 2.501776271730769e+01 + 5 3.534977827543630e+02 2.376530592528678e+02 2.457898146196709e+02 -8.982801828623059e+01 + ME 1.379086985977752e-08 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.932094880517371e+01 1.034008155560684e+01 3.499022683723039e+01 1.465959597811796e+01 + 3 2.173576252233973e+02 7.202316342302498e+01 2.050015172626940e+02 -5.601704498491753e+00 + 4 5.734773652773413e+02 -2.838136097561230e+02 1.954238788160289e+02 4.584055311524264e+02 + 5 6.698440606940879e+02 2.014503647774911e+02 -4.354156229159535e+02 -4.674634226320526e+02 + ME 7.600729941779242e-05 + +Event 204 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.908212834725718e+02 2.977981722293325e+02 4.775358544068218e+02 -1.798486735480492e+02 + 3 4.393058230892194e+02 -4.077733598319210e+02 -1.229004842814027e+02 -1.077309805714206e+02 + 4 7.659681746343533e+01 2.011886749910356e+01 -6.038949216792007e+01 4.260766189056932e+01 + 5 3.932760759747735e+02 8.985632010348482e+01 -2.942458779574990e+02 2.449719922289005e+02 + ME 3.370372723906801e-03 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.981880635358689e+02 2.184458843478823e+02 1.617999572875585e+02 2.909568754569331e+02 + 3 3.216615163919834e+02 -4.871917465073474e+01 3.144534008054081e+02 -4.702799029147249e+01 + 4 1.852586967674285e+02 1.516243001875516e+02 5.093442076969942e+00 -1.063245652161804e+02 + 5 5.948917233047179e+02 -3.213510098846989e+02 -4.813468001699369e+02 -1.376043199492806e+02 + ME 2.123639209037495e-08 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.823899014055036e+02 1.390835876764512e+01 3.318403571700483e+01 1.788056304052877e+02 
+ 3 4.881690514074010e+02 5.489817729671770e+01 -2.514792759478631e+02 -4.147931853951296e+02 + 4 5.532771902400449e+02 -1.644494445187262e+02 1.888131751045239e+02 4.933777602855134e+02 + 5 2.761638569470507e+02 9.564290845436338e+01 2.948206512633433e+01 -2.573902052956715e+02 + ME 1.981520297289002e-08 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.938861958251189e+02 2.992968080115893e+02 -3.402144065306870e+02 1.964666707857948e+02 + 3 3.044454630324483e+02 -3.028110458731303e+02 -2.191514037003863e+01 2.263265518723668e+01 + 4 2.202180359402747e+02 -1.939237559167995e+02 -5.680289179173083e+01 -8.753851568658551e+01 + 5 4.814503052021579e+02 1.974379937783406e+02 4.189324386924563e+02 -1.315608102864460e+02 + ME 1.372852518162136e-07 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.898839499787491e+02 2.406651709414215e+01 -5.533524123801432e+01 2.835339808336385e+02 + 3 2.168122352910971e+02 1.569806903165401e+02 -1.452119627527829e+02 3.575044213920987e+01 + 4 3.762784182507863e+02 3.600471233079040e+02 -8.530266455519371e+00 -1.089896857194042e+02 + 5 6.170253964793671e+02 -5.410943307185863e+02 2.090774704463165e+02 -2.102947372534441e+02 + ME 1.886181196479343e-07 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.204279908517810e+02 -2.514114625724550e+02 2.338847634961810e+01 1.972798848908106e+02 + 3 4.146640535559624e+02 1.131670399568331e+02 -3.642791244988921e+02 -1.626044828200972e+02 + 4 3.813647803536318e+02 -7.443149120704001e+00 1.658934600489188e+02 
-3.433118919267706e+02 + 5 3.835431752386245e+02 1.456875717363259e+02 1.749971881003552e+02 3.086364898560572e+02 + ME 2.900961922725036e-07 + +Event 210 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.750782658901178e+02 6.150161642286470e+01 3.397425940367914e+02 1.465476936620454e+02 + 3 3.753847414636021e+02 3.522640303812288e+01 2.877708085052046e+02 2.384549568051627e+02 + 4 3.679623986676974e+01 -1.418616000697859e+01 -1.464605117059101e+01 -3.063020270415600e+01 + 5 7.127407527795104e+02 -8.254185945400896e+01 -6.128673513714048e+02 -3.543724477630520e+02 + ME 7.589673857092090e-04 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.755660145440862e+02 -2.198570660497881e+01 3.262914547838685e+01 1.711005348715321e+02 + 3 3.842357993703558e+02 -1.193495468980239e+02 2.697221007833122e+02 2.462576364745863e+02 + 4 3.193436063508752e+02 3.079641211241163e+02 6.340526995545156e+01 5.584094160479156e+01 + 5 6.208545797346827e+02 -1.666288676211136e+02 -3.657565162171507e+02 -4.731991129509098e+02 + ME 8.184696619161020e-07 + +Event 212 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.286124870619989e+02 1.947671256067593e+02 2.039384905960800e+02 1.687039581318286e+02 + 3 2.534140104196739e+02 -1.089594202454935e+02 2.209716328642430e+02 5.931309202915251e+01 + 4 6.082520999117172e+02 1.448363181074745e+02 -4.014492274390981e+02 -4.333954035204287e+02 + 5 3.097214026066098e+02 -2.306440234687402e+02 -2.346089602122496e+01 2.053783533594477e+02 + ME 4.798047108164077e-08 + +Event 213 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.669993751556153e+02 -4.598496409826352e+02 -7.229659848051256e+01 3.237279737111812e+02 + 3 2.092334596630099e+02 2.048758411569909e+02 -3.638525763691985e+01 2.192358044350439e+01 + 4 4.459773655818122e+02 -9.956047386297890e+00 1.351965327017321e+02 -4.248747877636002e+02 + 5 2.777897995995633e+02 2.649298472119419e+02 -2.651467658429982e+01 7.922323360891463e+01 + ME 7.665958375339324e-08 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.625339438015198e+02 3.062049422550726e+01 -5.616013724447026e+01 1.494178945298839e+02 + 3 5.337445472246161e+02 -5.164508962454078e+02 9.025334708476193e+01 1.000802018349427e+02 + 4 1.876106068022419e+02 -1.103443290851434e+02 7.041196210723921e+01 -1.344024717617341e+02 + 5 6.161109021716223e+02 5.961747311050439e+02 -1.045051719475309e+02 -1.150956246030925e+02 + ME 9.950371319687602e-08 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.723879972712584e+02 2.234999350439451e+01 5.681683787000765e+02 6.567488603630498e+01 + 3 5.480386719020225e+02 2.169918591497191e+02 -4.867917566707426e+02 1.276507132776535e+02 + 4 1.691288617700992e+02 -6.631329221231215e+01 -1.308191347758221e+02 -8.422275901512093e+01 + 5 2.104444690566197e+02 -1.730285604418016e+02 4.944251274648833e+01 -1.091028402988376e+02 + ME 9.126574950869752e-08 + +Event 216 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 1.029822916324048e+02 2.312736480112498e+01 9.957621754525780e+01 -1.245207960932006e+01 + 3 5.228603101684607e+02 -4.839212819316630e+02 1.358805374086466e+02 -1.440124175083947e+02 + 4 6.978795308799911e+02 5.680185838524436e+02 -2.867570723039520e+02 2.866375926203801e+02 + 5 1.762778673191430e+02 -1.072246667219058e+02 5.130031735004758e+01 -1.301730955026653e+02 + ME 2.524082819455057e-07 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.167243056528579e+02 1.524729497764623e+02 -3.092700529235155e+02 2.340153432427175e+02 + 3 2.420772334393018e+02 1.940876116374085e+02 1.438364686179607e+02 1.557100684050792e+01 + 4 2.560961474686904e+02 1.653992903559016e+02 -1.049098293922634e+02 -1.649916337120024e+02 + 5 5.851023134391495e+02 -5.119598517697725e+02 2.703434136978181e+02 -8.459471637122280e+01 + ME 1.934350903516664e-08 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.191584476049009e+02 -2.558241509232280e+01 5.053737346676259e+02 1.160535310349098e+02 + 3 4.811621620037556e+02 2.736858157273282e+02 -3.947269007435494e+02 2.835091256357408e+01 + 4 3.658297349903178e+02 -3.118098839557957e+02 -2.678070504259062e+00 -1.913081786253101e+02 + 5 1.338496554010260e+02 6.370648332079029e+01 -1.079687634198174e+02 4.690373502682655e+01 + ME 1.053464420340619e-08 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.056249374264289e+02 2.080046640325060e+01 -9.374879826474562e+00 2.043552473445915e+02 + 3 3.816677590094031e+02 -3.102683269227689e+02 
-2.213707035898101e+02 -1.997135869083601e+01 + 4 4.652594638262416e+02 4.137417371803166e+02 5.174203782271513e+01 2.064144014279245e+02 + 5 4.474478397379267e+02 -1.242738766607982e+02 1.790035455935694e+02 -3.907982900816801e+02 + ME 2.769709221938521e-08 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.509318149012606e+02 -2.007423466948322e+02 -1.490284238622257e+02 2.144329392218190e+01 + 3 6.855319233176400e+02 5.660933754166979e+02 2.752308514532210e+02 2.715516278817843e+02 + 4 3.841620255253093e+02 -3.525889260783146e+02 -9.880530052663001e+01 -1.161852987709352e+02 + 5 1.793742362557902e+02 -1.276210264355109e+01 -2.739712706436526e+01 -1.768096230330310e+02 + ME 1.025073862425414e-07 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.326044260078135e+02 -1.267123948779659e+02 -3.623143164937092e+01 1.466922430768170e+01 + 3 4.838676388808952e+02 -8.498426264469489e+01 -3.969280642095542e+02 2.633508665293622e+02 + 4 4.068580325182036e+02 1.222413641068804e+02 3.430383967533347e+02 1.814253727535772e+02 + 5 4.766699025930878e+02 8.945529341578033e+01 9.012109910559042e+01 -4.594454635906212e+02 + ME 3.504069191306795e-07 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.235249191113402e+02 5.899149719992189e+02 1.986167723905413e+02 -3.653803893380113e+01 + 3 1.389939159797710e+02 -2.651253992124959e+01 -1.209781885374506e+02 -6.309256536639514e+01 + 4 6.073928038344548e+02 -5.826614920857787e+02 3.086798087831643e+01 1.687565451967728e+02 + 5 1.300883610744345e+02 
1.925906000780937e+01 -1.085065647314070e+02 -6.912594089657668e+01 + ME 3.395752332433782e-08 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.513921720685539e+02 -1.134255090057421e+02 -9.735296175095155e+01 -4.259248146678862e+02 + 3 3.148626886899702e+02 2.189368402500763e+02 4.819786486657082e+01 2.210930540127695e+02 + 4 4.219582225753616e+02 -2.567437085475601e+02 -2.191804642337468e+02 2.531626627654306e+02 + 5 3.117869166661142e+02 1.512323773032259e+02 2.683355611181276e+02 -4.833090211031403e+01 + ME 1.932630882190357e-07 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.315611776804204e+02 1.282183320794628e+02 -4.111637334587468e+02 -2.737688236334926e+01 + 3 1.674866220995890e+02 6.888658801083713e+01 -8.524503602723904e+01 1.266478993401409e+02 + 4 2.645671277337115e+02 -2.034555111374055e+02 -9.264145879346844e+01 1.414891521555669e+02 + 5 6.363850724862790e+02 6.350591047105598e+00 5.890502282794542e+02 -2.407601691323585e+02 + ME 1.033395869808800e-07 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.542911364008525e+02 -6.906505534545287e+00 1.338332937438159e+02 -7.646374591170161e+01 + 3 6.009262400854107e+02 -8.940736359077172e+01 4.411714460658597e+01 -5.925979639778540e+02 + 4 4.328793970916515e+02 7.191964334477241e+01 1.665393425924042e+01 4.265381387417558e+02 + 5 3.119032264220853e+02 2.439422578054464e+01 -1.946043726096423e+02 2.425235711477999e+02 + ME 5.889772210850316e-05 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.238244057688474e+02 -4.759413520569711e+02 -1.911585493182049e+02 1.064436306400156e+02 + 3 1.513714782200792e+02 5.621367009247984e+01 1.067079790997947e+02 -9.146996725539772e+01 + 4 3.309778079434476e+02 8.703450613686218e+01 -8.672921858056075e+01 3.073261243971419e+02 + 5 4.938263080676256e+02 3.326931758276291e+02 1.711797887989711e+02 -3.222997877817598e+02 + ME 2.012032683297799e-08 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.441906188052046e+02 5.273333141456225e+01 9.291280776591432e+01 -9.683666899684232e+01 + 3 5.088005428111287e+02 2.120952898491664e+02 -3.923670854733692e+02 -2.448298401458270e+02 + 4 2.230296359111398e+02 5.963056642642841e+01 1.366700392970478e+02 -1.658544976805341e+02 + 5 6.239792024725272e+02 -3.244591876901571e+02 1.627842384104072e+02 5.075210068232034e+02 + ME 7.600210514827709e-08 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.894755984215653e+02 1.858323987151167e+02 -6.826804175303918e+01 -4.476518239805803e+02 + 3 2.378337923954262e+02 -1.386422232509072e+02 -1.328849203068724e+02 -1.403026895404092e+02 + 4 3.773112740226970e+02 -1.117174380731088e+02 -1.040599922392212e+02 3.450427937954725e+02 + 5 3.953793351603124e+02 6.452726260889943e+01 3.052129542991329e+02 2.429117197255171e+02 + ME 2.095168317089492e-06 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.452325252559780e+01 
2.443195513199062e+01 -6.325325852480845e-01 2.017271769305098e+00 + 3 7.188955970603514e+02 6.816679247082510e+02 3.030179298338898e+01 -2.263217206515921e+02 + 4 2.627494813500970e+02 -2.618027083973374e+02 -2.041573691533987e+01 8.934736244973664e+00 + 5 4.938316690639541e+02 -4.442971714429038e+02 -9.253523482800947e+00 2.153697126373135e+02 + ME 2.440736751868551e-03 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.769099184850985e+02 2.249558416904206e+02 2.307537550587783e+01 3.015352101945322e+02 + 3 1.778006365465800e+02 -1.387428732425598e+02 1.051237453064108e+02 -3.622816105629497e+01 + 4 4.556085275574102e+02 3.501957619527958e+02 -2.744336173624777e+02 -9.812363810355457e+01 + 5 4.896809174109114e+02 -4.364087304006567e+02 1.462344965501891e+02 -1.671834110346827e+02 + ME 1.553706769792429e-08 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.014793994416338e+02 -5.768695854958890e+01 4.374811245688785e+02 -2.382521112678524e+02 + 3 2.526737499462902e+02 2.119664744384978e+02 -8.368970279598194e+01 1.091341892898568e+02 + 4 4.045018774470939e+02 -6.630541281823712e+01 -1.346697506713350e+02 3.756187153795021e+02 + 5 3.413449731649820e+02 -8.797410307067175e+01 -2.191216711015615e+02 -2.465007934015064e+02 + ME 2.231371465486821e-07 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.414187826108433e+02 2.584315608658833e+02 2.849671207419295e+01 -2.212868068115540e+02 + 3 4.120935142349621e+02 3.628024247372866e+02 1.267412408575709e+02 1.487686893978351e+02 + 4 
2.248548125131377e+02 -1.883103984200795e+02 9.094987429533833e+01 -8.262566745782664e+01 + 5 5.216328906410570e+02 -4.329235871830904e+02 -2.461878272271021e+02 1.551437848715456e+02 + ME 2.039994622498156e-08 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.894891429646787e+01 3.536480069898002e+00 2.591327577389482e+01 1.241108723382166e+01 + 3 4.495734558856914e+02 -1.428629550884405e+02 4.257543295140539e+02 2.096948259829423e+01 + 4 3.410993816387652e+02 -7.182369067942484e+01 8.919355629468804e+01 3.213015019059832e+02 + 5 6.803782481790761e+02 2.111501656979673e+02 -5.408611615826370e+02 -3.546820717380990e+02 + ME 1.318981303693756e-05 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.131015202000085e+02 -1.392455713354047e+02 9.575974407716110e+01 1.298191056405333e+02 + 3 4.325059163299788e+02 8.128751078497471e+01 1.232789394922660e+02 4.065169262508996e+02 + 4 1.602879687109333e+02 -1.232589269689068e+02 -1.390049118716281e+01 1.015196837106304e+02 + 5 6.941045947590799e+02 1.812169875193368e+02 -2.051381923822642e+02 -6.378557156020632e+02 + ME 4.336839675584952e-06 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.910303275896592e+02 2.464345432925338e+02 2.918360128905574e+02 -3.085652930339882e+02 + 3 5.092125502368183e+02 -3.743627265831034e+02 -2.807984020263058e+02 2.007541473629421e+02 + 4 3.077109354739686e+02 1.158554772554631e+02 9.067751992059638e+01 2.702611988081597e+02 + 5 1.920461866995535e+02 1.207270603510670e+01 -1.017151307848479e+02 
-1.624500531371134e+02 + ME 2.901290924106063e-07 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.274482187963926e+02 -1.539135099798990e+02 1.447655980669331e+02 -8.417983900936365e+01 + 3 5.575149033864810e+02 5.352858954032090e+02 1.235129998854975e+02 9.506006806071186e+01 + 4 1.694314553150287e+02 1.372529159712161e+02 -9.883130012029676e+01 -1.005132946921373e+01 + 5 5.456054225020973e+02 -5.186253013945260e+02 -1.694472978321338e+02 -8.288995821344750e-01 + ME 8.043345014180297e-08 + +Event 237 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.339554322971692e+02 1.120133622069438e+02 -3.285326904336018e+02 -2.604545934011883e+02 + 3 6.302873937313195e+02 -1.022747807849330e+02 1.408185925137292e+02 6.057822974588029e+02 + 4 2.435720079051767e+02 6.470340927096325e+01 5.460699052093423e+01 -2.283831614850804e+02 + 5 1.921851660663347e+02 -7.444199069297396e+01 1.331071073989386e+02 -1.169445425725342e+02 + ME 1.497097920780776e-06 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.083377096896616e+02 -5.784356003262199e+02 1.495271412220012e+02 1.145803619460331e+02 + 3 1.717721328115097e+02 5.705865046083076e+00 1.603017198195137e+02 -6.145296850109771e+01 + 4 3.206654997358106e+02 2.010041670743243e+02 -2.004126881056234e+02 -1.491926338190073e+02 + 5 3.992246577630189e+02 3.717255682058127e+02 -1.094161729358917e+02 9.606524037407164e+01 + ME 4.296625493436613e-08 + +Event 239 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.881743793489993e+02 1.049103829092727e+02 2.635007061567174e+02 5.104568861247707e+01 + 3 4.803894780132095e+02 1.090279613672190e+02 -2.314406086068728e+02 -4.065983262552767e+02 + 4 2.440501385224371e+02 -1.729404459755228e+02 -1.217772311844275e+02 -1.217471897978040e+02 + 5 4.873860041153544e+02 -4.099789830096894e+01 8.971713363458294e+01 4.772998274406037e+02 + ME 3.660224776715063e-07 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.087013709129607e+02 -4.676309448785901e+02 -3.172734601441521e+02 2.262215945294168e+02 + 3 2.729447571190920e+02 -5.767450544615320e+00 2.662954895847357e+02 -5.960108371364922e+01 + 4 1.525519054527852e+02 8.110058050875803e+01 1.020251557883443e+02 7.928207417053407e+01 + 5 4.658019665151617e+02 3.922978149144474e+02 -5.104718522892793e+01 -2.459025849863017e+02 + ME 4.308829536838534e-08 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.530270912109602e+02 -5.453728034889818e+02 8.831979059784145e+01 2.463906281857454e+01 + 3 1.982544179365819e+02 -1.586324861506672e+02 1.089318049583214e+02 4.769077937666253e+01 + 4 5.722723170577570e+02 5.530346801359586e+02 -1.096388820792352e+02 -9.812014560724825e+01 + 5 1.764461737947002e+02 1.509706095036902e+02 -8.761271347692811e+01 2.579030341201114e+01 + ME 1.218622007691242e-05 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.746108664048627e+02 -4.749300259421787e+02 -3.143228612830267e+02 
7.629060967091235e+01 + 3 5.051636989462200e+02 4.444288635815062e+02 9.034183757269773e+01 2.225122476014893e+02 + 4 3.777952608872155e+02 -2.186699418937968e+00 2.474964598096178e+02 -2.854294656761925e+02 + 5 4.243017376170172e+01 3.268786177961076e+01 -2.351543609928828e+01 -1.337339159620901e+01 + ME 5.951019815807999e-07 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.107527202455985e+02 3.904995916125712e+02 3.043450795877113e+02 -3.576353265109072e+02 + 3 1.508627981883119e+02 3.391389294497834e+01 -9.072320151009265e+01 -1.156664707263216e+02 + 4 6.531020347957227e+02 -4.643459729748700e+02 -2.271350825653670e+02 3.991675581812210e+02 + 5 8.528244677036683e+01 3.993248841732056e+01 1.351320448774861e+01 7.413423905600774e+01 + ME 1.410484048413108e-06 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.464269710040987e+02 3.069536149467578e+02 -1.106795177814204e+02 -1.163665280648364e+02 + 3 5.041376845601655e+02 6.573269089487459e+01 4.202350193677655e+02 2.706225172419645e+02 + 4 4.310807055576997e+02 -3.784254305479272e+02 -2.032279747057621e+02 3.637524591991033e+01 + 5 2.183546388780368e+02 5.739124706294892e+00 -1.063275268805829e+02 -1.906312350970385e+02 + ME 4.210409446454129e-08 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.442357593704551e+02 7.260817279327249e+01 -1.113850962118338e+02 -5.590498964023364e+01 + 3 5.569552277539377e+02 4.707984336772996e+02 -2.468590917743929e+02 -1.661582058694779e+02 + 4 5.295813762554727e+02 -2.928127385193997e+02 
4.173322561045921e+02 1.433559285556812e+02 + 5 2.692276366201345e+02 -2.505938679511724e+02 -5.908806811836556e+01 7.870726695403035e+01 + ME 7.270878934404332e-07 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.246317093722425e+02 -2.157268871477893e+02 -2.199347190619293e+02 5.433740901680579e+02 + 3 1.880460358713021e+02 -1.070645769102478e+02 1.157715879471019e+02 -1.024471932370385e+02 + 4 1.420672142172262e+02 -1.168777621962524e+01 -1.208441773051402e+02 -7.377786967598762e+01 + 5 5.452550405392301e+02 3.344792402776624e+02 2.250073084199677e+02 -3.671490272550326e+02 + ME 1.813004405476584e-07 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.065097258271049e+02 -1.668806543559496e+02 -1.865206091223489e+02 1.769438374201404e+02 + 3 3.129023884913621e+02 7.386736466907031e+01 2.951038514843158e+02 7.324775764191564e+01 + 4 3.864124550177795e+02 -1.802989827999254e+02 -3.272147765988919e+02 9.867802272804730e+01 + 5 4.941754306637530e+02 2.733122724868046e+02 2.186315342369250e+02 -3.488696177901034e+02 + ME 9.900426556209394e-09 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.666327106485754e+02 -2.164686785201293e+02 -1.361374419638053e+02 -7.550437406605914e+01 + 3 3.506800144563603e+02 -3.287838552500039e+02 -1.212782822609880e+02 -1.300874005742749e+01 + 4 4.601802762612302e+02 3.201063220472946e+02 1.987803806208572e+02 -2.641669728134899e+02 + 5 4.225069986338337e+02 2.251462117228388e+02 5.863534360393606e+01 3.526800869369766e+02 + ME 8.556898600116979e-07 + +Event 
249 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.764901453884635e+02 7.453429552391282e+01 8.820686664603214e+01 1.334651972855999e+02 + 3 6.230936441513318e+02 2.971067073789105e+02 9.787518090098752e+01 -5.388819376967908e+02 + 4 5.022078434333577e+02 -3.822006854333496e+02 -2.438015631086574e+02 2.160929241924341e+02 + 5 1.982083670268468e+02 1.055968253052626e+01 5.771951556163774e+01 1.893238162187566e+02 + ME 2.513325362889350e-05 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.228636101802504e+02 -4.155243332015379e+01 5.704763635314592e+01 -1.005705184346682e+02 + 3 3.405147702190894e+02 7.985751885454557e+01 2.522537891570215e+01 -3.300557008723140e+02 + 4 3.193622656237184e+02 -1.644879368855030e+02 -2.193702205694558e+01 -2.728639631514912e+02 + 5 7.172593539769417e+02 1.261828513511112e+02 -6.033599321190265e+01 7.034901824584733e+02 + ME 1.031363863798651e-04 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.961580549700980e+02 -3.878197351809906e+02 7.914688695318326e+01 -1.651750003655490e+01 + 3 3.066463839754016e+02 7.034111344423620e+01 1.893486642450590e+02 -2.307189110437774e+02 + 4 3.269242895484091e+02 -8.795629422661055e+01 -3.930068447480089e+01 3.124078065645385e+02 + 5 4.702712715060912e+02 4.054349159633650e+02 -2.291948667234414e+02 -6.517139548420626e+01 + ME 1.790407192334059e-08 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 2.237658022858007e+02 1.405436681763271e+02 -1.649016760008568e+02 5.591107994180530e+01 + 3 3.054719110851954e+02 -2.214763823298169e+02 1.666458004254375e+02 -1.284152550622531e+02 + 4 5.392505931602816e+02 4.029685409231230e+02 5.432531306200936e+01 3.541981332682593e+02 + 5 4.315116934687225e+02 -3.220358267696331e+02 -5.606943748659005e+01 -2.816939581478114e+02 + ME 7.387058846856028e-09 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.589202324543649e+02 -4.375953164514952e+02 2.198479485092992e+02 2.693864228256691e+02 + 3 4.197833585606816e+02 3.026019377854699e+02 -1.044594107420259e+02 -2.715480931260489e+02 + 4 2.738706420871745e+02 2.336491570328417e+02 7.464988490753908e+01 -1.218219795372532e+02 + 5 2.474257668977780e+02 -9.865577836681666e+01 -1.900384226748120e+02 1.239836498376324e+02 + ME 2.635793263425697e-07 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.131109686858098e+02 -1.596979849442970e+01 4.209499761303451e+01 1.037644830809086e+02 + 3 2.488622312155221e+02 1.244501515047506e+02 -2.082237961096237e+02 5.556456289498273e+01 + 4 5.979154868428426e+02 -5.590425628529679e+02 -1.316391630725938e+02 -1.662692787945094e+02 + 5 5.401113132558255e+02 4.505622098426469e+02 2.977679615691829e+02 6.940232818618021e+00 + ME 7.769239627025203e-08 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.625199149979433e+02 2.903215750787058e+02 -2.451585309797628e+02 -5.427184495671275e+02 + 3 2.457331142254968e+02 2.358267179437861e+01 
-4.583225821432127e+01 2.402665709685297e+02 + 4 4.002681720338182e+02 -2.621095389751817e+02 2.783157645467164e+02 1.185476036294650e+02 + 5 1.914787987427401e+02 -5.179470789790255e+01 1.267502464736808e+01 1.839042749691328e+02 + ME 1.572612999555951e-07 + +Event 0 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.345677694047433e+02 2.100238310553183e+02 -2.816345401110253e+02 -2.557755312978801e+02 + 3 3.778855054923723e+02 -2.840270738478316e+02 2.448136425844732e+02 4.682261192186964e+01 + 4 2.920279085923864e+02 -1.576945398626579e+02 -2.408852190745838e+02 4.885737125284020e+01 + 5 3.955188165104976e+02 2.316977826551712e+02 2.777061166011359e+02 1.600955481231703e+02 + ME 6.360009844214530e-09 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.725349105457669e+02 2.881950845301121e+02 -4.457540318931225e+02 4.129771691104841e+02 + 3 6.375781261318956e+02 -3.835006725455601e+02 4.372392202052156e+02 -2.612565126332686e+02 + 4 3.152489653934795e+01 3.054926600229694e+01 -1.796306705170926e+00 -7.571970071154194e+00 + 5 1.583620667829892e+02 6.475632201315099e+01 1.031111839307813e+01 -1.441486864060612e+02 + ME 1.166771281726837e-04 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.782380757885197e+02 1.996205682030240e+02 4.001020415553458e+02 1.696515317568694e+02 + 3 2.717797792000939e+02 4.604565925658515e+01 1.990097475342964e+02 1.792739970882946e+02 + 4 6.916972528618887e+02 -2.565335659091062e+02 -5.673237009836265e+02 -3.012962619762857e+02 + 5 5.828489214949754e+01 1.086733844949693e+01 
-3.178808810601573e+01 -4.762926686887830e+01 + ME 4.317439016534556e-05 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.018313811485532e+02 -3.989539969780533e+02 2.934712901062139e+01 -3.798569717437193e+01 + 3 2.683933065123219e+02 2.366019084548150e+02 -7.671591945800627e+01 -1.008423105539992e+02 + 4 3.388431725058181e+02 -2.729478414859575e+02 1.984381945822547e+02 -3.060154102164474e+01 + 5 4.909321398333069e+02 4.352999300091959e+02 -1.510694041348699e+02 1.694295487500159e+02 + ME 4.947652021107378e-09 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.025396283630530e+02 -2.571095877388792e+02 -4.112978625189389e+02 -1.314336572395788e+02 + 3 5.923738354103538e+02 2.675275915658220e+02 2.602417944619204e+02 4.600108227476859e+02 + 4 3.268206458950787e+01 -3.102096879508959e+01 -3.014419071639536e+00 9.835147101461047e+00 + 5 3.724044716370855e+02 2.060296496814673e+01 1.540704871286581e+02 -3.384123126095682e+02 + ME 4.137282647477196e-06 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.074067282413408e+02 -4.618538476611251e+01 3.035171302884308e+02 1.559353787719173e+01 + 3 3.831379513801059e+02 -2.749543838833998e+01 -1.652445324602141e+02 -3.445764576223725e+02 + 4 5.614489455937761e+02 1.616386414426313e+02 -3.588979307262703e+02 4.003625150052696e+02 + 5 2.480063747847773e+02 -8.795781828817880e+01 2.206253328980537e+02 -7.137959526008893e+01 + ME 6.031082846485431e-07 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.286636464658633e+02 -1.747226577501439e+01 -3.778773270997241e+02 -3.693084032177421e+02 + 3 1.310282673263388e+02 -1.083675617071470e+02 -6.717507586954586e+01 3.020575425495842e+01 + 4 6.582258651099336e+02 2.776185230670747e+02 5.267955454451471e+02 2.804918866008896e+02 + 5 1.820822210978650e+02 -1.517786955849134e+02 -8.174314247587706e+01 5.861076236189421e+01 + ME 6.633162370764102e-08 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.262567818350127e+02 -6.135678852045160e+02 -8.153386103597421e+01 9.531126137330720e+01 + 3 1.336446614791592e+02 -4.519989455404551e+01 6.404586862299973e+01 -1.082404350809238e+02 + 4 3.176656341984029e+02 2.701266119371286e+02 1.170579115681671e+02 -1.193336248227656e+02 + 5 4.224329224874256e+02 3.886411678214330e+02 -9.956991915519264e+01 1.322627985303823e+02 + ME 9.690561203685568e-08 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.349855291489960e+02 -3.626063542068543e+02 -1.229423826784078e+02 -2.064321026371652e+02 + 3 3.119050592775228e+02 -2.002874147789406e+02 -1.957942322506285e+02 1.372382457679728e+02 + 4 1.367195114173143e+02 9.785893219779987e+01 -8.501635945817435e+01 4.345195986113885e+01 + 5 6.163899001561674e+02 4.650348367879951e+02 4.037529743872106e+02 2.574189700805361e+01 + ME 4.712996050580979e-08 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.655124632121682e+02 -6.972273330724498e+01 
-3.084842179247440e+02 1.832364279147364e+02 + 3 1.839264563143947e+02 1.553544456242578e+02 -7.816914971633298e+01 -5.986252241407793e+01 + 4 2.954499153271153e+02 2.111586753349157e+01 -1.306467051615633e+02 2.641518711566042e+02 + 5 6.551111651463217e+02 -1.067475798505044e+02 5.173000728026402e+02 -3.875257766572626e+02 + ME 2.573714938734375e-08 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.642811111709574e+02 1.645024039884384e+02 5.389597634493801e+02 -2.957202882697342e+01 + 3 3.050705971359639e+02 -1.347545052623103e+02 -2.204142360425299e+02 -1.622555302545695e+02 + 4 1.790656381713290e+02 -1.323273797872345e+02 -8.759515510166027e+00 -1.203213955220661e+02 + 5 4.515826535217495e+02 1.025794810611063e+02 -3.097860118966842e+02 3.121489546036091e+02 + ME 1.759839172050816e-07 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.912845371100813e+02 -1.020148610633588e+02 3.329066770248591e+02 -1.785208713174465e+02 + 3 2.847080639014686e+02 2.638796146707180e+02 5.173703293990509e+01 9.353881565602217e+01 + 4 2.573001248270789e+02 -1.895266044096599e+02 1.605970764533154e+02 -6.701939638360365e+01 + 5 5.667072741613714e+02 2.766185080230083e+01 -5.452407864180796e+02 1.520014520450280e+02 + ME 4.929253318341834e-09 + +Event 12 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.295908994071822e+02 -3.523645455233028e+01 1.467318606160349e+01 -4.278918446697189e+02 + 3 2.043256044303077e+02 2.071061668376129e+01 1.800525177697819e+02 9.434571440797527e+01 + 4 2.514463210684131e+02 
-2.896341838569114e+00 2.221301356297973e+02 -1.177924718642132e+02 + 5 6.146371750940953e+02 1.742217970713842e+01 -4.168558394611834e+02 4.513386021259574e+02 + ME 1.786795978263545e-07 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.900763907650167e+02 -4.345521592554665e+02 1.816886031264379e+02 1.353829426687154e+02 + 3 8.155287447201781e+01 -3.586849648635566e+01 -4.644236448070596e+00 -7.309414040996307e+01 + 4 2.274805892576970e+02 -1.172993534843645e+02 1.746430859700105e+02 8.653365058779553e+01 + 5 7.008901455052688e+02 5.877200092261867e+02 -3.516874526483778e+02 -1.488224528465479e+02 + ME 8.929398436615371e-08 + +Event 14 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.553813512209236e+02 -2.884258797827895e+02 -3.779404323704547e+02 2.870888133228751e+02 + 3 1.895171303098595e+02 3.950181939875258e+00 1.673649699243205e+02 -8.882626630539811e+01 + 4 2.675598101961875e+02 1.545161726663045e+02 -2.183413083877818e+02 -6.330676763838189e+00 + 5 4.875417082730292e+02 1.299595251766097e+02 4.289167708339160e+02 -1.919318702536388e+02 + ME 8.622235410515352e-09 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.826944038147662e+02 3.569588644211593e+02 -4.592592011223918e+02 3.455609255435130e+01 + 3 3.389524473661556e+02 -2.377392070550028e+02 9.239231435980069e+01 -2.232319225639668e+02 + 4 7.820502782875148e+01 1.594347126700341e+01 -7.655713718050529e+01 9.147941732036023e-01 + 5 5.001481209903263e+02 -1.351631286331600e+02 4.434240239430964e+02 1.877610358364119e+02 + ME 
5.580162763607776e-08 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.671075373493464e+02 -1.624326760732477e+01 1.171675550031839e+02 1.180374912939780e+02 + 3 4.295095098103945e+02 2.195907717116854e+02 -2.808696979999064e+02 -2.395214494404898e+02 + 4 3.847682842045415e+02 -1.737210238377358e+02 -3.429919065735126e+02 1.497299006686934e+01 + 5 5.186146686357174e+02 -2.962648026662482e+01 5.066940495702352e+02 1.065109680796425e+02 + ME 2.098251646724183e-07 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.032955713685548e+02 6.495895482552957e+02 2.251021895685562e+02 -1.482804226981584e+02 + 3 2.234221561647206e+02 -1.180503892545828e+02 -2.021783199888668e+01 1.886075415549418e+02 + 4 3.595704725836891e+02 -3.353015999954382e+02 -1.206079698882031e+02 -4.814020558746078e+01 + 5 2.137117998830352e+02 -1.962375590052745e+02 -8.427638768146636e+01 7.813086730677433e+00 + ME 2.416002605335386e-05 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.215031941006907e+02 3.997910206623500e+02 -2.192085154561077e+02 2.531409646945914e+02 + 3 4.430902120680963e+02 8.075108458464338e+01 3.816053226272268e+02 -2.102036538967018e+02 + 4 2.188624192138139e+02 -1.829448025271727e+02 -1.088960009329281e+02 5.073084617106944e+01 + 5 3.165441746173994e+02 -2.975973027198207e+02 -5.350080623819097e+01 -9.366815696895908e+01 + ME 6.469828043384208e-08 + +Event 19 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.615465159306794e+02 -1.661354327315054e+02 1.685644476861035e+02 1.113176758779641e+02 + 3 6.118064822100384e+02 5.530315541715971e+02 -1.761644968719660e+02 1.934666426199023e+02 + 4 1.123687407649047e+02 7.588942961656002e+01 -1.456870938581165e+01 -8.157990610680608e+01 + 5 5.142782610943772e+02 -4.627855510566516e+02 2.216875857167411e+01 -2.232044123910603e+02 + ME 9.787803276569699e-08 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.201161646354791e+02 -1.169238528739237e+02 1.621234917435766e+02 -4.801724802288322e+02 + 3 2.564308777002585e+02 1.861146313022993e+02 1.511095254422972e+02 -9.101675875319815e+01 + 4 6.849003476946968e+02 -5.145604386088208e+01 -3.163529905442016e+02 6.052780742735055e+02 + 5 3.855260996956564e+01 -1.773473456749349e+01 3.119973358328020e+00 -3.408883529147523e+01 + ME 2.698358389155324e-03 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.051714469465807e+02 4.769595747359659e+02 1.465851962980007e+02 7.887036898648940e+01 + 3 2.982914519086229e+02 -2.692776124268369e+02 -2.614768404820439e+01 -1.256330225154668e+02 + 4 4.082168347698474e+02 -2.526674124215031e+02 -2.413280677431799e+02 -2.110945916149397e+02 + 5 2.883202663749494e+02 4.498545011237396e+01 1.208905554933834e+02 2.578572451439171e+02 + ME 2.522751227763643e-07 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.495241619209652e+02 3.451716010099906e+01 -9.553928903830848e+01 3.344367648377936e+02 + 3 
6.022361346253780e+02 4.159349120916945e+01 7.293136399340369e+01 -5.963550615896627e+02 + 4 1.609604585668471e+02 1.426706346559205e+02 4.848087149789286e+01 -5.659473763285189e+01 + 5 3.872792448868096e+02 -2.187812859660891e+02 -2.587294645298800e+01 3.185130343847212e+02 + ME 2.361700381695961e-05 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.973901127820891e+02 4.157619482135283e+02 4.273300237487649e+02 3.749666745683650e+01 + 3 2.020962069046400e+02 -8.124921004062800e+01 1.541203841174710e+02 -1.024126452763532e+02 + 4 4.866251843058940e+02 -1.871724592614197e+02 -4.344368137067278e+02 1.141717801872393e+02 + 5 2.138884960073768e+02 -1.473402789114806e+02 -1.470135941595083e+02 -4.925580236772270e+01 + ME 1.588834023877901e-07 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.729751897416456e+02 5.325486799695835e+02 -2.114043177632211e+02 8.282229144929715e-01 + 3 2.209760217295413e+02 4.190554168819079e+01 2.140047851899925e+02 3.572505662723610e+01 + 4 6.217469614831004e+02 -6.139571429576340e+02 -6.952416455108568e+01 -6.922644919567129e+01 + 5 8.430182704571223e+01 3.950292129985990e+01 6.692369712431422e+01 3.267316965394235e+01 + ME 3.604073696319397e-08 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.927699720954060e+02 -2.047743757602093e+02 1.139305300318160e+01 4.480623826814639e+02 + 3 3.149042319173597e+02 -7.235220864995340e+01 2.398037103748990e+01 -3.055401364597459e+02 + 4 2.888907726753627e+02 7.777131619454690e+01 2.777989383587109e+02 1.540294650737276e+01 + 
5 4.034350233118719e+02 1.993552682156157e+02 -3.131723623993824e+02 -1.579251927290908e+02 + ME 3.037671037738176e-08 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.781578630360667e+02 -2.003585770703804e+02 -8.178895358715289e+01 -1.747535534237924e+02 + 3 4.314269515912293e+02 -8.376259389173956e+00 2.624301036293826e+02 3.423295101901347e+02 + 4 4.627048772771270e+02 1.415704984190216e+02 -4.400078394753884e+02 2.113524624015278e+01 + 5 3.277103080955769e+02 6.716433804053281e+01 2.593666894331585e+02 -1.887112030064950e+02 + ME 4.015206612066857e-08 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.288747384351032e+02 -3.287538335736823e+02 7.343114151656756e+00 -5.058573506526161e+00 + 3 3.748617761535498e+02 2.826910393495446e+02 2.461509109435953e+02 4.105670969105075e+00 + 4 4.493229387685308e+02 -2.643020753278851e+02 -3.392708645659920e+02 1.301183950777378e+02 + 5 3.469405466428156e+02 3.103648695520229e+02 8.577683947073986e+01 -1.291654925403164e+02 + ME 3.568624835218510e-09 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.972279565562765e+02 2.667568883382366e+02 -1.717272400122293e+02 -6.208682154576160e+02 + 3 5.905663106606714e+02 -1.231890261491060e+02 2.466624534377953e+02 5.222553639627256e+02 + 4 1.643113640003870e+02 -9.794033068572114e+01 -8.940761721344892e+01 9.701646225797271e+01 + 5 4.789436878266502e+01 -4.562753150340967e+01 1.447240378788296e+01 1.596389236917516e+00 + ME 1.394543579819096e-05 + +Event 29 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.266188853291728e+02 -3.664030707671374e+00 1.148798248336851e+02 6.159873604030605e+02 + 3 2.715483186965963e+02 -1.390232669167604e+02 8.666144865649636e+01 -2.165659575267933e+02 + 4 3.262257817760322e+02 -5.448790208210592e+01 -2.425277014775895e+02 -2.112691251415396e+02 + 5 2.756070141981987e+02 1.971751997065377e+02 4.098642798740789e+01 -1.881522777347276e+02 + ME 2.564168485106579e-07 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.746957600003899e+02 -2.591574015416286e+02 -1.764470547739902e+02 2.051847719740561e+02 + 3 4.260450496435592e+02 1.858543667698223e+02 -3.681463334087965e+02 -1.069617495931632e+02 + 4 2.090363679505942e+02 1.796219811338596e+02 6.625234226347425e+01 -8.392123786102671e+01 + 5 4.902228224054565e+02 -1.063189463620533e+02 4.783410459193126e+02 -1.430178451986609e+01 + ME 1.301553736708785e-08 + +Event 31 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.268147451203489e+02 1.033619585238285e+02 -7.175680626342714e+01 -1.578752244433521e+01 + 3 3.789446667338626e+02 -1.394826691605475e+01 3.303299056302544e+02 -1.851665729617091e+02 + 4 4.489033212820731e+02 -1.541836769602651e+02 -2.486905723036657e+02 -3.404329373964335e+02 + 5 5.453372668637156e+02 6.476998535249120e+01 -9.882527063161541e+00 5.413870328024778e+02 + ME 7.765861908673737e-07 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
6.594239401383120e+02 6.472187280476992e+02 -9.434320337386465e+01 -8.394766744235476e+01 + 3 1.799463961036491e+02 -4.308808552135048e+01 1.452564949994532e+02 9.708075513339404e+01 + 4 5.532563487173284e+02 -5.363715880205567e+02 -1.284853434295565e+02 -4.346979967891591e+01 + 5 1.073733150407109e+02 -6.775905450579199e+01 7.757205180396799e+01 3.033671198787660e+01 + ME 4.932353313235260e-08 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.632108428205664e+02 1.176561246583502e+02 1.493047995597393e+02 -6.353847143216234e+02 + 3 5.142529371160995e+02 -2.302785097856058e+02 -1.735375888118208e+02 4.258081687013944e+02 + 4 1.166729919959779e+02 1.150834163714542e+02 1.881380173106929e+01 3.799368630713534e+00 + 5 2.058632280673559e+02 -2.461031244198737e+00 5.418987521012294e+00 2.057771769895156e+02 + ME 6.520437368575939e-07 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.690779677366501e+02 3.511855664846810e+02 -5.534458382142302e+01 3.060062331888240e+02 + 3 3.742689592978852e+02 -3.012002941387806e+02 8.305424587573982e+01 -2.060524907557447e+02 + 4 2.980406440054937e+02 2.332618722083638e+02 1.822761656253146e+02 -3.453293931873203e+01 + 5 3.586124289599713e+02 -2.832471445542644e+02 -2.099858276796314e+02 -6.542080311434725e+01 + ME 6.521450501563154e-09 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.283056560696552e+02 -3.779242940990954e+02 5.849933729772783e+01 -3.644968061596919e+02 + 3 3.435696257780648e+02 3.183925539049349e+02 4.269218267655494e+01 1.218345062512435e+02 
+ 4 3.087475922619403e+02 2.738015531924525e+02 9.348544997317434e+01 1.077880134349780e+02 + 5 3.193771258903396e+02 -2.142698129982919e+02 -1.946769699474571e+02 1.348742864734704e+02 + ME 8.201282417926274e-06 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.861171657357098e+02 1.949976067191706e+02 -3.323631134924989e+02 2.443685411720820e+01 + 3 4.856425374020077e+02 -3.587793340935829e+02 3.248213110244531e+02 -4.021416996509027e+01 + 4 3.386102480932667e+02 1.540387599323064e+02 2.845112873791638e+02 9.991140028120809e+01 + 5 2.896300487690154e+02 9.742967442106188e+00 -2.769694849111184e+02 -8.413408443332618e+01 + ME 7.094656500602908e-08 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.304409112998236e+02 2.170475590461168e+02 -3.972923294204467e+01 -4.823686407122115e+02 + 3 2.292982173116324e+02 5.858630347836768e+01 -2.026677724490634e+02 8.983925376787056e+01 + 4 1.467869786978192e+02 -1.013820811213810e+02 -4.989060265398779e+01 -9.369641673806765e+01 + 5 5.934738926907255e+02 -1.742517814031036e+02 2.922876080450957e+02 4.862258036824082e+02 + ME 4.817842476217318e-08 + +Event 38 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.017706346629274e+02 2.384436739982338e+02 -4.258734741499015e+02 1.164051895567999e+02 + 3 4.770954291403306e+02 -2.105777645255067e+02 4.274009986857632e+02 2.460568870096300e+01 + 4 2.156156104165689e+02 -2.877166591024517e+01 1.906669901245644e+02 9.647995425422904e+01 + 5 3.055183257801733e+02 9.057564375181597e-01 -1.921945146604260e+02 
-2.374908325119925e+02 + ME 2.742745272240175e-07 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.689399151263620e+02 -2.579554373803593e+00 -1.305084655116407e+02 1.072454253193667e+02 + 3 1.752066215161591e+02 -1.477482509589512e+02 8.725713980331183e+01 -3.541194876059084e+01 + 4 6.166504538775522e+02 5.541135799160111e+01 3.673230597344312e+02 -4.922002981106906e+02 + 5 5.392030094799273e+02 9.491644734115388e+01 -3.240717340261026e+02 4.203668215519151e+02 + ME 2.705673710338525e-05 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.915352846637887e+02 2.413866074952699e+02 1.305008487961100e+02 9.846246182894077e+01 + 3 4.766337666546407e+02 -4.064718254101213e+02 2.442499654159483e+02 4.798288296322364e+01 + 4 5.420029607651860e+02 3.967025561147059e+01 -4.874329454717395e+02 -2.336720008188668e+02 + 5 1.898279879163847e+02 1.254149623033808e+02 1.126821312596812e+02 8.722665602670243e+01 + ME 4.217571079806621e-06 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.178258392081233e+02 -3.063771146787844e+01 2.817624231441971e+01 -1.102286641039828e+02 + 3 6.481000691270629e+02 4.281730940284307e+02 -2.495995064512898e+01 4.858791022635328e+02 + 4 5.135691872852316e+02 -3.271780048393351e+02 2.027042603872939e+02 -3.400277137219925e+02 + 5 2.205049043795825e+02 -7.035737772121716e+01 -2.059205520565847e+02 -3.562272443755763e+01 + ME 1.163929313556577e-07 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.407568007632807e+02 -6.712702422129367e+01 5.650132897991668e+02 2.946578060970235e+02 + 3 2.519649653275297e+02 9.668586554572211e+01 -2.235157404081881e+02 -6.464441929476367e+01 + 4 1.611455149449823e+02 -1.500904782980546e+02 -2.019767982646422e+01 5.507067314617083e+01 + 5 4.461327189642074e+02 1.205316369736260e+02 -3.212998695645146e+02 -2.850840599484306e+02 + ME 1.550132291421948e-08 + +Event 43 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.656037718110565e+01 5.565869919512737e+01 5.206539740066112e+01 -7.266013455170691e+00 + 3 5.180921726156695e+02 9.446420117550205e+01 -5.353707605507631e+01 -5.065864146514116e+02 + 4 4.045145408547239e+02 -6.817883407235416e+01 -3.582159575777363e+02 1.751142143976551e+02 + 5 5.008329093485007e+02 -8.194406629827523e+01 3.596876362321515e+02 3.387382137089272e+02 + ME 3.587004551261473e-06 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.113647181056540e+02 -5.107143134714064e+01 9.810164906833063e+01 -1.303364066484094e+01 + 3 6.573281629571673e+02 -3.985633843498063e+02 -2.702857905045169e+02 -4.474071232374617e+02 + 4 1.273704188821621e+02 8.136377415273439e+01 -8.144355608270773e+01 5.449868837360310e+01 + 5 6.039367000550166e+02 3.682710415442125e+02 2.536276975188940e+02 4.059420755286995e+02 + ME 4.057570198187262e-07 + +Event 45 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.791094112024887e+02 -3.344017621786854e+02 5.108659183713403e+00 
-4.727759228180172e+02 + 3 2.010161055980396e+02 -2.492393218233912e+01 -1.782237380498131e+02 8.956880880040752e+01 + 4 3.598579655450536e+02 2.990024010429074e+02 -1.090682288963148e+02 1.679268917780071e+02 + 5 3.600165176544180e+02 6.032329331811717e+01 2.821833077624145e+02 2.152802222396027e+02 + ME 4.014095331371912e-08 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.267953668800280e+02 -6.352714736069532e+01 -9.200146001961876e+01 5.980884520463429e+01 + 3 7.280210303874185e+02 4.206450303526108e+02 5.658444328595525e+02 -1.813627771287734e+02 + 4 6.518774346619087e+01 2.681482978220009e+01 -4.773984775270041e+01 3.537391324853007e+01 + 5 5.799958592663626e+02 -3.839327127741155e+02 -4.261031250872333e+02 8.618001867560903e+01 + ME 9.000760615385622e-06 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.159530641405004e+02 1.142445149356236e+02 -2.007237833711747e+02 -4.613737291960360e+02 + 3 3.022880532645047e+02 -2.500992722738971e+02 1.529012103800725e+02 7.382168393378667e+01 + 4 3.701099938195063e+02 2.419009630277547e+02 2.308398354487969e+02 1.586767216112436e+02 + 5 3.116488887754887e+02 -1.060462056894813e+02 -1.830172624576946e+02 2.288753236510054e+02 + ME 5.168291445254857e-08 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.117281863235952e+02 1.108482515422887e+02 3.904601382797303e+02 -6.909156936926900e+01 + 3 4.416457800211124e+02 -1.761655849232860e+02 -3.193915704670098e+02 2.490094504506014e+02 + 4 2.951205671254978e+02 -2.720452019398314e+02 
2.727574676948143e+00 -1.143683416788644e+02 + 5 3.515054665297945e+02 3.373625353208287e+02 -7.379614248966868e+01 -6.554953940246799e+01 + ME 8.203574828107892e-09 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.790151177355860e+02 2.685445491881807e+02 -1.264003222356048e+02 -4.971572087442830e+02 + 3 5.825462968193997e+02 -3.620774874303168e+02 2.071002122380677e+02 4.066565911492056e+02 + 4 1.232538965502943e+02 1.411023551560036e+01 -5.744174282575164e+01 -1.081335768837097e+02 + 5 2.151846888947205e+02 7.942270272653563e+01 -2.325814717671124e+01 1.986341944787866e+02 + ME 4.262707636225685e-08 + +Event 50 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.145744464521338e+02 1.949057366487759e+00 -3.132665042327794e+02 -2.858987341389516e+01 + 3 5.210452536770442e+02 3.365475988572654e+02 2.288931725040629e+02 -3.253179762383411e+02 + 4 4.024143942462561e+02 -3.880355946879494e+02 2.653164231204763e+01 1.032559630625573e+02 + 5 2.619659056245661e+02 4.953893846419622e+01 5.784168941666873e+01 2.506518865896790e+02 + ME 4.645051983526767e-08 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.804391138743968e+02 5.017232555386195e+02 -1.014098872260621e+02 2.736775009748180e+02 + 3 4.713537758174335e+01 8.111299990939797e+00 -4.333873138520374e+00 4.622951628396847e+01 + 4 6.266947405808535e+02 -5.426030878998814e+02 2.770731071369993e+02 -1.468287443747501e+02 + 5 2.457307679630062e+02 3.276853237032218e+01 -1.713293467724168e+02 -1.730782728840363e+02 + ME 1.863828179212747e-06 + +Event 52 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.239697017241610e+02 -1.405450188662117e+02 3.730121131586497e+02 3.400783871899554e+02 + 3 1.578462691260114e+02 -1.307296241737952e+02 -8.016414633287303e+01 3.740213473989415e+01 + 4 2.346875433863188e+02 3.589966406507264e+01 1.851707703338134e+02 -1.396468508585222e+02 + 5 5.834964857635084e+02 2.353749789749342e+02 -4.780187371595901e+02 -2.378336710713274e+02 + ME 4.946848397303685e-08 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.236952601159668e+02 -3.969697599659207e+01 1.283750074878056e+02 -1.788395284817703e+02 + 3 4.313350831023798e+02 -3.366597921026201e+02 2.109624625450273e+02 -1.679433764496105e+02 + 4 2.801401728473172e+02 2.616468403601891e+02 9.956816004668734e+01 1.027759102960049e+01 + 5 5.648294839343358e+02 1.147099277390230e+02 -4.389056300795202e+02 3.365053139017804e+02 + ME 8.501496023283101e-08 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.424957591900097e+02 1.303189036682192e+02 1.786751139223700e+02 -2.615253947032346e+02 + 3 4.181611430197448e+02 -3.495405289942557e+02 -1.455704883643566e+02 1.774525092506670e+02 + 4 4.253746812368033e+02 1.528889148664904e+02 -1.709820665168174e+02 3.582369775412607e+02 + 5 3.139684165534423e+02 6.633271045954612e+01 1.378774409588040e+02 -2.741640920886931e+02 + ME 3.545099883776616e-05 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 6.902820092019048e+02 4.969094085134601e+02 -1.717027031534157e+02 -4.473125011537593e+02 + 3 3.505542265094328e+02 -3.042029506312312e+02 -2.771059843041887e+01 1.719911430437845e+02 + 4 3.123150316279190e+02 -2.383922595804614e+02 1.302132333211215e+02 1.541243764656621e+02 + 5 1.468487326607426e+02 4.568580169823257e+01 6.920006826271272e+01 1.211969816443130e+02 + ME 5.553897698014209e-07 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.517045596935313e+02 3.287009769062940e+02 7.982883452857163e+01 4.358450942081273e+02 + 3 5.634713549544337e+02 1.236880428650323e+01 8.234347921332713e+00 -5.632753998258917e+02 + 4 2.247534233722243e+02 -1.830483440080827e+02 -7.950172450526782e+01 1.033773711898374e+02 + 5 1.600706619798101e+02 -1.580214371847146e+02 -8.561457944636564e+00 2.405293442792710e+01 + ME 6.113752122051862e-06 + +Event 57 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.612663949947443e+02 6.166031259026278e+01 1.487802576587276e+02 8.324117783561414e+00 + 3 1.820024643451797e+02 1.879353241782060e+01 -1.625270683069972e+02 -7.972861615826162e+01 + 4 5.924987722813793e+02 -1.264958307278939e+02 5.759352761456844e+02 5.789782080869779e+01 + 5 5.642323683786965e+02 4.604198571981062e+01 -5.621884654974148e+02 1.350667756600244e+01 + ME 1.419246782436345e-08 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.537542945315817e+02 1.629560282904648e+02 -2.140600948134415e+02 2.297078787325215e+02 + 3 2.777758408292174e+02 2.346896280376003e+02 7.436606577911790e+01 
1.286463544002993e+02 + 4 3.282171909538803e+02 1.357516725902970e+02 1.084620089653986e+02 -2.784492780342276e+02 + 5 5.402526736853205e+02 -5.333973289183622e+02 3.123202006892498e+01 -7.990495509859326e+01 + ME 5.900203600208378e-08 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.976381540886910e+02 -6.639185065184326e+01 -1.092066538588677e+02 1.507543328450558e+02 + 3 5.908605962463504e+02 2.928235141717452e+02 4.943985123342441e+02 -1.376253782788215e+02 + 4 3.266674027313050e+02 -3.263528609865800e+02 -1.366055614966453e+01 -4.334897781080566e+00 + 5 3.848338469336542e+02 9.992119746667828e+01 -3.715313023257117e+02 -8.794056785153776e+00 + ME 1.581886554928656e-08 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.121408887067226e+02 -1.629155490191453e+02 -1.421475012227830e+02 -6.785257969686268e+02 + 3 2.741179903154751e+02 1.475044814487676e+02 1.545489065904012e+02 1.717490496009702e+02 + 4 6.757364169575578e+00 1.698500861258775e-01 -6.755016584407215e+00 5.359489612222504e-02 + 5 5.069837568082268e+02 1.524121748425186e+01 -5.646388783210987e+00 5.067231524715342e+02 + ME 2.496265208416112e-01 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.060218431628060e+02 -3.666102684855926e+01 -1.896168417000883e+02 -4.677173593228888e+02 + 3 6.566657576617764e+02 1.298514611056881e+02 2.926865236287042e+02 5.733089169203428e+02 + 4 1.062674698969952e+02 1.030596485895223e+02 -2.044593557069221e+01 -1.592004112909394e+01 + 5 2.310449292784226e+02 -1.962500828466512e+02 
-8.262374635792366e+01 -8.967151646835990e+01 + ME 8.006232376536141e-07 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.055086643741489e+02 -7.294551827191355e+00 9.237032345627719e+01 -5.046376042508069e+01 + 3 5.688846753260368e+02 3.341862040344009e+02 -2.722490171883782e+02 3.712543972720730e+02 + 4 4.946635586748539e+02 -1.135243476177038e+02 -1.573702928991467e+01 -4.812032883179144e+02 + 5 3.309431016249599e+02 -2.133673045895060e+02 1.956157230220158e+02 1.604126514709220e+02 + ME 1.208629811005688e-07 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.158808969828534e+02 -1.476287203313588e+02 1.676739582816469e+02 2.233202761197066e+02 + 3 3.883323257562104e+02 -3.056810783903564e+02 2.043617890712476e+02 1.248892818016628e+02 + 4 2.887403188019102e+02 1.019387632956730e+02 -1.499858918976024e+01 -2.697304257251938e+02 + 5 5.070464584590261e+02 3.513710354260422e+02 -3.570371581631342e+02 -7.847913219617561e+01 + ME 5.311609593885341e-07 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.850523188566764e+02 2.550712447674044e+01 -2.660317865305242e+02 9.915291040546425e+01 + 3 2.916452653107142e+02 1.403399316576783e+01 2.690561401486174e+02 -1.116637867095131e+02 + 4 4.714598286310267e+02 -1.052157000050604e+02 -2.066429807719118e+02 4.104908098641694e+02 + 5 4.518425872015820e+02 6.567458236255209e+01 2.036186271538185e+02 -3.979799335601205e+02 + ME 1.616151563560158e-08 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.214350019381618e+02 7.995884478713781e+01 -7.224395149801894e+01 5.598083875737888e+01 + 3 6.120578807552534e+02 9.201985783628675e+01 5.195324550411797e+02 -3.102148018990814e+02 + 4 5.322028765827238e+02 4.286174464190788e+00 -3.665653478175947e+02 3.858126181590675e+02 + 5 2.343042407238607e+02 -1.762648770876154e+02 -8.072315572556603e+01 -1.315786550173648e+02 + ME 2.512981208009846e-08 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.420838586983577e+02 1.172583084765124e+02 -5.279399036277819e+02 -3.721366834936305e+01 + 3 1.564392099528226e+02 1.132387905244197e+02 6.972777909444964e+01 -8.239077347485902e+01 + 4 4.857304734552536e+02 1.638606370619556e+01 4.794549877218303e+02 -7.608222202289350e+01 + 5 3.157464578935655e+02 -2.468831627071277e+02 -2.124286318849815e+01 1.956866638471152e+02 + ME 3.234202721838722e-08 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.575902343233387e+02 1.284720285965683e+02 -3.052315686261780e+02 -1.349051632284906e+02 + 3 2.817442134209691e+02 1.513959995933571e+01 2.809460911141384e+02 1.482862761546491e+01 + 4 3.417076894701672e+02 1.623591614032428e+02 -1.451356906432049e+02 -2.633235254454033e+02 + 5 5.189578627855242e+02 -3.059707899591467e+02 1.694211681552445e+02 3.834000610584290e+02 + ME 9.261561784325655e-09 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.312846117834605e+02 -2.146983334976369e+01 
-8.138388544700084e+01 -1.007539514689958e+02 + 3 6.081561718481714e+02 -1.919908989361123e+02 -4.948570314768851e+02 2.968331896547476e+02 + 4 6.188990257047044e+02 3.200282424269924e+02 4.846263547333212e+02 -2.139047086092678e+02 + 5 1.416601906636632e+02 -1.065675101411164e+02 9.161456219056456e+01 1.782547042351602e+01 + ME 9.108685454807970e-08 + +Event 69 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.931414138169077e+02 -3.273293746991303e+02 -5.999656711319622e+02 1.155062486203778e+02 + 3 5.318906858450802e+02 2.665952958823760e+02 4.596824129140024e+02 -2.295058081397401e+01 + 4 9.890991607956896e+01 7.762108251097791e+01 3.690462310319209e+01 4.895087172158099e+01 + 5 1.760579842584429e+02 -1.688700369422366e+01 1.033786351147677e+02 -1.415065395279848e+02 + ME 2.997028652760413e-07 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.380439264868779e+02 -3.354639703644540e+02 -1.520232726263372e+02 2.371398968949265e+02 + 3 2.951328688021417e+02 2.031166168690902e+02 6.863385339078722e+01 2.028212128150538e+02 + 4 2.729251659354400e+02 2.078106869723441e+02 -1.750231714082777e+02 -2.587960685865386e+01 + 5 4.938980387755408e+02 -7.546333347698067e+01 2.584125906438277e+02 -4.140815028513266e+02 + ME 9.592826222215548e-08 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.516725804906697e+02 -2.115628301998922e+02 -4.748726394059173e+02 -1.846070997732852e+02 + 3 1.486032143367139e+02 3.144790462194493e-03 1.024005313005051e+02 -1.076895839470461e+02 + 4 6.474143806083582e+02 
3.410247127162772e+02 4.478272683866318e+02 3.198409967853593e+02 + 5 1.523098245642589e+02 -1.294650273068472e+02 -7.535516028121995e+01 -2.754431306502833e+01 + ME 3.911029737863209e-07 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.330749580482132e+02 -3.265677636286840e+02 5.763987838257717e+00 -6.526254577917835e+01 + 3 6.712248537502356e+02 5.525262209628081e+02 3.205348494218363e+02 2.061916335516648e+02 + 4 1.960894953123145e+02 -1.804902499505635e+02 -3.994939157176093e+01 -6.540952497662286e+01 + 5 2.996106928892366e+02 -4.546820738356064e+01 -2.863494456883330e+02 -7.551956279586351e+01 + ME 2.336724102883240e-08 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.244112630934241e+02 1.874119439001354e+02 2.373895141185757e+02 2.977379755937671e+02 + 3 2.512397615119820e+02 -1.915148586935148e+02 1.506486288779340e+02 -6.122472769558973e+01 + 4 3.648202995074480e+02 -4.800070047693638e+01 -2.942119755412287e+02 2.103071495088906e+02 + 5 4.595286758871462e+02 5.210361527031584e+01 -9.382616745528102e+01 -4.468203974070680e+02 + ME 1.469359655839249e-08 + +Event 74 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.175955016648010e+02 -2.448313074458401e+02 -4.559235086301704e+02 9.822856832534079e+00 + 3 6.621390955434566e+02 2.137341741969738e+02 6.260267976401974e+02 2.891942710019760e+01 + 4 2.303718015973827e+02 1.812134960949857e+01 -2.278277430547247e+02 2.893619107872612e+01 + 5 8.989360119435972e+01 1.297578363936777e+01 5.772445404469759e+01 -6.767847501145771e+01 + ME 
3.579224468849613e-08 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.234767238744632e+02 1.148240181757038e+02 1.757017011162820e+02 -7.672159532030987e+01 + 3 5.117399613751516e+02 -4.727573887809738e+02 1.390803688823399e+02 1.379669903012637e+02 + 4 2.544294994045669e+02 -3.670658913956010e+01 -4.967306761896027e+01 -2.468189272205846e+02 + 5 5.103538153458186e+02 3.946399597448302e+02 -2.651090023796617e+02 1.855735322396308e+02 + ME 1.870732328495155e-08 + +Event 76 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.909285481378914e+02 -3.305040374270265e+02 -1.794570557368611e+02 1.067116495041263e+02 + 3 4.098525442038565e+02 2.778747229690340e+02 -2.854331647728487e+02 -9.639841685947228e+01 + 4 4.418316193782279e+02 2.211223520423719e+02 3.525074401020419e+02 1.485213452078249e+02 + 5 2.573872882800243e+02 -1.684930375843794e+02 1.123827804076678e+02 -1.588345778524789e+02 + ME 6.487990429439647e-09 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.911537813436147e+02 -2.642087240745080e+02 -4.088350218788157e+02 6.541950730783678e+01 + 3 3.180168689971177e+02 3.520456914156713e+01 -8.253704519800402e+01 -3.050950728023466e+02 + 4 3.461489397531647e+02 1.250503078887523e+02 3.201771174570674e+02 -4.084265473804997e+01 + 5 3.446804099061027e+02 1.039538470441886e+02 1.711949496197523e+02 2.805182202325598e+02 + ME 6.847746249442819e-08 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 5.642050168228875e+02 4.544857132649138e+02 1.463535046289702e+01 -3.339997664116333e+02 + 3 1.651631715519658e+02 1.196484546942684e+02 -1.099426959841972e+02 -2.959263634933707e+01 + 4 2.320532834071474e+02 -7.238565948697823e+01 1.373958575216963e+02 1.724280167970126e+02 + 5 5.385785282179988e+02 -5.017485084722041e+02 -4.208851200039619e+01 1.911643859639578e+02 + ME 2.507307280648115e-07 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.982601538273627e+02 -1.711694840703935e+02 -2.385903670700898e+02 -5.229496940965934e+01 + 3 3.869985664719069e+02 -7.300257822028927e+01 3.454374333335043e+02 1.584660647471954e+02 + 4 2.730442121954245e+02 -2.168309222333608e+02 1.645934868985690e+02 -2.113000348842465e+01 + 5 5.416970675053060e+02 4.610029845240435e+02 -2.714405531619834e+02 -8.504109184911134e+01 + ME 2.210014151109632e-08 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.717649893571369e+02 6.886949171513174e+01 -2.685228819958284e+02 -2.477128625390008e+02 + 3 3.869685808114618e+02 5.957956738527032e+01 2.814529638922007e+02 2.588033747881130e+02 + 4 3.309739394987644e+02 1.882716768699517e+02 -1.270374363062545e+01 -2.719120063801442e+02 + 5 4.102924903326370e+02 -3.167207359703538e+02 -2.263382657468468e-01 2.608214941310321e+02 + ME 3.603319636600762e-09 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.842246424902095e+02 -2.267486624627472e+02 -2.937281983436480e+02 -9.968633540199460e+01 + 3 2.254487557377375e+02 
2.209193002070300e+01 1.769877698127916e+02 -1.378927751129431e+02 + 4 4.703100720128003e+02 -6.491977305379048e+01 4.323580934137869e+02 1.733305107675069e+02 + 5 4.200165297592529e+02 2.695765054958347e+02 -3.156176648829309e+02 6.424859974743073e+01 + ME 1.873826280780797e-08 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.929255068593493e+02 -4.355325407251513e+02 -3.510627019846902e+02 -1.965376348924526e+02 + 3 7.260108906535530e+01 -5.579709963741175e+01 7.472103649087548e+00 -4.584505941304569e+01 + 4 5.079399281835098e+02 3.053634150176129e+02 1.183021662560418e+02 3.882792202410245e+02 + 5 3.265334758917857e+02 1.859662253449502e+02 2.252884320795608e+02 -1.458965259355263e+02 + ME 3.848767623773752e-07 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.902208799106456e+02 9.347119462039889e+01 -6.680743332736718e+01 2.665109036504028e+02 + 3 6.713082397370665e+02 3.520657855373974e+01 4.944193978438478e+02 -4.527302823990738e+02 + 4 1.855962177862651e+02 -4.627668248951797e+01 -1.740298043062298e+02 -4.492273285839192e+01 + 5 3.528746625660227e+02 -8.240109068462064e+01 -2.535821602102507e+02 2.311421116070632e+02 + ME 5.872831276142019e-07 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.787180416895204e+02 1.865558524434927e+02 1.788512602743487e+02 1.043689962613429e+02 + 3 5.712981506042241e+02 2.561938996798008e+02 -3.762285149930340e+02 -3.452511653228694e+02 + 4 4.097077405278722e+02 -4.006461115622685e+02 2.118683226805500e+01 8.303158481526178e+01 + 5 
2.402760671783825e+02 -4.210364056102516e+01 1.761904224506299e+02 1.578505842462649e+02 + ME 5.209305651329578e-08 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.203351269117375e+02 -1.925479700358236e+02 -6.940689928982219e+01 5.855968162880135e+02 + 3 5.381473375593999e+02 9.356601590033819e+01 1.871475275927688e+02 -4.958061723160798e+02 + 4 1.334694954173727e+02 6.261202027528761e+01 -5.387043968641776e+01 1.048418659315403e+02 + 5 2.080480401114899e+02 3.636993386019776e+01 -6.387018861652887e+01 -1.946325099034741e+02 + ME 8.437936743198283e-08 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.865599090092865e+02 -2.891134797262822e+02 -7.185603877268257e+01 2.463304057272006e+02 + 3 2.932777885173375e+02 -1.951329295480071e+02 1.024098870135540e+02 -1.935128318368158e+02 + 4 5.494991859904237e+02 4.929671188610552e+02 -1.532307333884744e+02 1.882899823973622e+02 + 5 2.706631164829519e+02 -8.720709586765839e+00 1.226768851476031e+02 -2.411075562877469e+02 + ME 8.114587380173013e-09 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.398434642042320e+02 -5.773916908620570e+01 -2.210310523256464e+02 7.305032226170285e+01 + 3 6.256736457816573e+02 -1.570961431352854e+02 3.525379921364366e+02 -4.924482479795603e+02 + 4 2.661332101449429e+02 1.721182608911090e+02 1.257989686761089e+02 1.593010021622582e+02 + 5 3.683496798691676e+02 4.271705133038222e+01 -2.573059084868991e+02 2.600969235555993e+02 + ME 2.459828582291058e-07 + +Event 88 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.084047713119920e+02 -9.094715275662556e+01 -2.599647375380899e+01 2.935409711195911e+02 + 3 6.399024272101833e+02 3.541182238176332e+02 -3.940685931943562e+02 -3.588667493200646e+02 + 4 2.347172247375547e+02 -1.607307767247329e+02 1.246137179386500e+02 1.171717299670462e+02 + 5 3.169755767402702e+02 -1.024402943362748e+02 2.954513490095154e+02 -5.184595176657293e+01 + ME 7.925392943020492e-08 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.184418668698222e+02 5.725262999818231e+02 2.132430012949283e+02 9.597604349672920e+01 + 3 1.648425597874048e+02 -1.197110850513234e+02 -9.455524227052834e+01 -6.246304341176983e+01 + 4 5.018918856507312e+02 -4.440754270637875e+02 -1.508485608523585e+02 1.787097972941989e+02 + 5 2.148236876920411e+02 -8.739787866711959e+00 3.216080182795854e+01 -2.122227973791583e+02 + ME 2.292709257217331e-07 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.957637279642277e+02 1.263767639048391e+02 1.039632395279027e+02 1.074429874782744e+02 + 3 6.255717748556117e+02 2.485042080720027e+02 -5.052377532279100e+02 -2.726178951934749e+02 + 4 5.792311181609889e+02 -2.996537209891231e+02 4.496683582853566e+02 2.086017816327423e+02 + 5 9.943337901917147e+01 -7.522725098771859e+01 -4.839384458534928e+01 -4.342687391754170e+01 + ME 3.651661570809365e-08 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
5.017185800766916e+02 -1.626043578155511e+02 -4.726483587494785e+02 -4.341526672713017e+01 + 3 1.532087699529515e+02 4.558627180762441e+01 -1.068434756756598e+02 -9.989639992873859e+01 + 4 1.553335691954189e+02 1.187326020855014e+02 -7.919197047207838e+01 -6.131654535069170e+01 + 5 6.897390807749382e+02 -1.714516077574787e+00 6.586838048972166e+02 2.046282120065602e+02 + ME 1.987036800648648e-06 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.316508895826993e+02 2.423717673725613e+02 -3.493294830668556e+02 7.448039418853942e+01 + 3 4.088752336424421e+02 -4.083682512007605e+02 -4.296296540486002e+00 1.989648042821915e+01 + 4 4.838676110831551e+02 3.367111480297710e+02 3.431854235720047e+02 -5.456402565332682e+01 + 5 1.756062656917031e+02 -1.707146642015718e+02 1.044035603533698e+01 -3.981284896343173e+01 + ME 4.816430660826380e-09 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.625352690202037e+02 -2.779056014020862e+02 -1.510359273299078e+02 -1.771678488002014e+02 + 3 4.496460928618028e+02 1.458684829985394e+02 -2.529482449111344e+02 -3.419373917692632e+02 + 4 5.819497911300330e+02 7.907814180644715e+01 3.218768382402446e+02 4.783382776751111e+02 + 5 1.058688469879609e+02 5.295897659709959e+01 8.210733400079756e+01 4.076696289435349e+01 + ME 2.730449779071184e-06 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.529640986072860e+02 -9.457834198297702e+01 -4.155446162454761e+01 1.128103692467343e+02 + 3 5.452273502833526e+02 1.506136657220611e+02 3.951184420445460e+02 
-3.441944275421371e+02 + 4 5.763591625604009e+02 -1.414467252236212e+02 -5.409858543903306e+02 1.397033054025703e+02 + 5 2.254493885489614e+02 8.541140148453748e+01 1.874218739703321e+02 9.168075289283261e+01 + ME 5.910846564707206e-08 + +Event 95 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.175930213981758e+02 1.049723762521990e+02 5.988073145300187e+02 1.087738036893139e+02 + 3 2.127444997197512e+02 1.110742673830454e+02 1.426661177414861e+01 -1.808844743885238e+02 + 4 3.734458243090436e+02 -6.700755739503158e+01 -3.574088045654865e+02 8.503362490734189e+01 + 5 2.962166545730298e+02 -1.490390862402128e+02 -2.556651217386810e+02 -1.292295420813228e+01 + ME 4.906535489502023e-07 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.336643939052876e+02 5.655461284496032e+01 -2.475517668243394e+02 -3.515458790106136e+02 + 3 3.201335954241759e+02 2.737812519173168e+02 1.122352583779800e+02 1.221989844255047e+02 + 4 2.711474972504126e+02 -1.056171090554795e+02 1.641528331527196e+02 -1.882015911406193e+02 + 5 4.750545134201238e+02 -2.247187557067975e+02 -2.883632470636025e+01 4.175484857257282e+02 + ME 7.554491167226350e-09 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.808150504334332e+02 -2.215516143647532e+02 2.522196782723237e+02 1.797782486876414e+02 + 3 1.725086962483917e+02 -1.958631536182568e+01 -1.200850556071191e+01 -1.709719928118794e+02 + 4 6.244594303322474e+02 3.979033339800429e+02 -4.294715131468291e+02 -2.172020633737499e+02 + 5 3.222168229859279e+02 -1.567654042534641e+02 
1.892603404352174e+02 2.083958074979878e+02 + ME 4.808761944638582e-05 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.497882471555648e+02 -2.192521180269852e+02 2.018837878810383e+02 -3.368692805982171e+02 + 3 3.552361781915541e+02 9.190647471572817e+01 -1.700004974647007e+02 2.980700807917881e+02 + 4 3.423969339559751e+02 -8.430327104189779e+01 -1.816338984228904e+02 2.777368283425729e+02 + 5 3.525786406969056e+02 2.116489143531549e+02 1.497506080065527e+02 -2.389376285361441e+02 + ME 2.622877009116387e-06 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.510905676127443e+02 -1.120176118685159e+02 1.002985425558890e+02 -1.485316848286259e+01 + 3 6.608990669934884e+02 5.206564204577553e+02 -3.715703155588056e+02 1.662527268483448e+02 + 4 4.055868262761768e+02 -2.099131999005440e+02 1.513461738821255e+02 -3.123002686638891e+02 + 5 2.824235391175902e+02 -1.987256086886954e+02 1.199255991207911e+02 1.609007102984069e+02 + ME 2.152474006036951e-07 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.912277729130220e+02 -3.038923856333701e+02 -3.681394565520995e+02 -1.158770182765827e+02 + 3 3.291961677177790e+02 9.682949861260765e+01 4.959602220292174e+01 -3.106998545544404e+02 + 4 1.561943773057186e+02 1.080498253360949e+02 -6.909857434628780e+01 8.914766272915458e+01 + 5 5.233816820634804e+02 9.901306168466756e+01 3.876420086954656e+02 3.374292101018685e+02 + ME 1.690671035087832e-07 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.606145981152812e+02 1.415165413755632e+02 -1.858039240081086e+02 1.156284527747264e+02 + 3 3.602882052374051e+02 2.118000521994822e+02 1.364734610909919e+02 2.575331495930820e+02 + 4 4.124277552237737e+02 -3.176993998179823e+02 2.596287687502777e+02 4.191237372697503e+01 + 5 4.666694414235399e+02 -3.561719375706301e+01 -2.102983058331611e+02 -4.150739760947835e+02 + ME 5.007627759721156e-08 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.668883721914397e+02 -1.522538817869545e+02 1.714625751179097e+02 6.262263502751124e+02 + 3 1.233366340807367e+02 9.011481525477787e+01 7.793251741380507e+01 -3.190247807788264e+01 + 4 1.269706430435817e+02 -4.043133476762941e+01 5.414615182810898e+01 -1.074943980215522e+02 + 5 5.828043506842412e+02 1.025704012998064e+02 -3.035412443598237e+02 -4.868294741756773e+02 + ME 5.779429806445845e-06 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.552864941980412e+02 -3.167444776289481e+02 2.214161018629589e+02 -5.291958382354644e+02 + 3 1.601276089464259e+02 -7.094294880163687e+01 -2.470441050126062e+01 1.414130166003577e+02 + 4 5.894184034750449e+02 3.412800894354302e+02 -1.500599500250239e+02 4.565347372418695e+02 + 5 9.516749338048808e+01 4.640733699515487e+01 -4.665174133667438e+01 -6.875191560676254e+01 + ME 8.777290536240856e-06 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.774569993979300e+02 1.989410031267868e+02 
4.017120069248927e+02 1.643627909271369e+02 + 3 4.609712540524605e+01 4.243610185127382e+01 -1.704461951459786e+01 -5.796824639221473e+00 + 4 7.261932715446645e+02 -3.396385607991187e+02 -6.122865574881962e+02 -1.926330375981751e+02 + 5 2.502526036521592e+02 9.826145582105819e+01 2.276191700779015e+02 3.406707131025970e+01 + ME 5.198252253578472e-05 + +Event 105 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.921789576562786e+02 4.810279062964444e+02 1.348719512102337e+02 4.790964616472720e+02 + 3 8.813504188469318e+01 -2.537319850765734e+01 7.858109277220778e+01 3.080581542880035e+01 + 4 6.360767083273017e+01 -3.678422527687846e+01 -3.637890142810105e+01 -3.700583859919021e+01 + 5 6.560783296262981e+02 -4.188704825119086e+02 -1.770741425543403e+02 -4.728964384768820e+02 + ME 3.135288710202953e-04 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.864390473248632e+02 2.130816993347910e+02 -9.460577540765883e+00 1.911910424511779e+02 + 3 3.195711363524375e+02 -1.238775433747336e+02 -2.683784353696865e+02 1.214622610753980e+02 + 4 5.745504627825516e+02 2.049135896350913e+02 3.226669556919595e+02 -4.289576794638239e+02 + 5 3.194393535401478e+02 -2.941177455951488e+02 -4.482794278150708e+01 1.163043759372481e+02 + ME 3.217019824895859e-08 + +Event 107 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.722313928324550e+02 -1.968575188090820e+02 1.163894612093211e+02 -2.936951147324815e+02 + 3 5.656198121119700e+02 -1.543046273498364e+02 8.076299742136007e+01 5.381386364864783e+02 + 4 4.865572520387373e+01 
-4.212119983306868e+01 2.435239455081532e+01 3.807878398712479e-01 + 5 5.134930698516997e+02 3.932833459919868e+02 -2.215048531814960e+02 -2.448243095938672e+02 + ME 9.215289556294778e-06 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.379979300943136e+02 -3.192200473520488e+02 -3.606896713798772e+02 5.591444063494898e+02 + 3 1.260099028519024e+01 3.978889775076310e+00 1.147615618471866e+01 3.354285549740169e+00 + 4 7.222347693662582e+02 2.991742959970567e+02 3.284329357345791e+02 -5.694291961437907e+02 + 5 2.716631025423775e+01 1.606686157991582e+01 2.078057946057941e+01 6.930504244560661e+00 + ME 2.832408657659794e-03 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.231511027318798e+02 2.454122241586620e+02 -1.092654998279543e+02 -1.796121543150014e+02 + 3 1.956814867717928e+02 1.324393710405636e+02 -9.556622719590956e+01 -1.077875386264064e+02 + 4 5.871682488784803e+02 -4.565221017188250e+02 -1.796226224478535e+02 3.226295656603770e+02 + 5 3.939991616178473e+02 7.867050651959941e+01 3.844543494717173e+02 -3.522987271896914e+01 + ME 2.548053674525758e-05 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.199007184505406e+01 3.732711745037530e+01 1.830560612245135e+01 5.895525497219112e+00 + 3 6.503495414798850e+02 6.292604094065496e+02 -1.169464273078949e+02 1.153663572993341e+02 + 4 5.826055008056770e+02 -5.629196868847104e+02 1.294559693061781e+02 7.610353275407952e+01 + 5 2.250548858693839e+02 -1.036678399722146e+02 -3.081514812073458e+01 -1.973654155506326e+02 + ME 
5.938797557675147e-06 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.499685704497379e+02 1.681858109101244e+02 -7.955231263377713e+01 -1.669408541244689e+02 + 3 2.500774020307764e+02 -9.118504148310892e+01 1.285070504366491e+02 -1.941904560071345e+02 + 4 4.449919367221886e+02 -3.058493380776594e+02 3.227017355837655e+02 1.837378557748529e+01 + 5 5.549620907972972e+02 2.288485686506438e+02 -3.716564733866373e+02 3.427575245541182e+02 + ME 1.863121094452810e-07 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.601284604146208e+02 -2.488585849081574e+02 -5.878734146366801e+02 1.680590293030910e+02 + 3 3.001424246875746e+02 -1.647249933729960e+01 1.562495115994508e+02 -2.557346710711166e+02 + 4 3.055316759483335e+02 9.306007508284051e+01 2.734221395002294e+02 9.964818645861379e+01 + 5 2.341974389494710e+02 1.722710091626164e+02 1.582017635369998e+02 -1.197254469058826e+01 + ME 1.698614883714245e-07 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.646683093846071e+02 2.910998267486567e+02 3.270673291512863e+02 1.555779252178637e+02 + 3 1.032835588102810e+02 6.414528091080065e+01 -8.089516327050815e+01 -2.974729706893166e+00 + 4 2.576709858897332e+02 5.223657579993225e+01 2.415431221983603e+02 7.295613227654690e+01 + 5 6.743771459153787e+02 -4.074816834593896e+02 -4.877152880791386e+02 -2.255593277875174e+02 + ME 2.603347606813896e-08 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.292401292391439e+02 -4.020985695220133e+02 -3.022214392513490e+02 -1.645419595187189e+02 + 3 2.852547491939838e+02 5.684779875308552e+01 2.983811334987576e+01 -2.779357600352486e+02 + 4 5.503881444993327e+02 3.517711961045184e+02 2.897049565647914e+02 3.086343683687984e+02 + 5 1.351169770675392e+02 -6.520425335590622e+00 -1.732163066331816e+01 1.338433511851691e+02 + ME 1.089183309357228e-06 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.315689135772182e+02 1.523084328556000e+02 1.421486881719875e+02 1.010942800923345e+02 + 3 4.774006609503493e+02 -1.978437903340176e+02 -1.809847573733626e+02 3.949857507564100e+02 + 4 1.705370598210000e+02 -1.094359051505278e+02 -9.529413476772473e+01 8.958626744734215e+01 + 5 6.204933656514328e+02 1.549712626289454e+02 1.341302039690998e+02 -5.856662982960866e+02 + ME 3.653127636476731e-06 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.746322505032659e+02 1.490212136649899e+02 -3.941831318833103e+01 -6.567857701582012e+02 + 3 3.002577903546940e+02 2.698491623731489e+01 1.898406713834985e+02 2.310564313171374e+02 + 4 3.649703499796597e+02 -2.118033082945360e+02 -1.168367756298048e+02 2.732981573669968e+02 + 5 1.601396091623805e+02 3.579717839223130e+01 -3.358558256536295e+01 1.524311814740669e+02 + ME 1.924015596474713e-06 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.989590248458574e+02 1.044204896489003e+02 1.692644965960479e+02 -5.530379840932109e+00 + 3 
4.228744835378425e+02 3.264095412410632e+02 -1.699630168106393e+02 -2.083079766375305e+02 + 4 4.224386223860297e+02 -2.861508131031816e+02 2.925187696477572e+02 -1.049041050081309e+02 + 5 4.557278692302706e+02 -1.446792177867818e+02 -2.918202494331658e+02 3.187424614865934e+02 + ME 2.002158878826574e-08 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.729668676760357e+02 1.184007139072070e+02 5.539744922226419e+02 -8.593348804405296e+01 + 3 2.157021508192258e+02 -1.942718312667870e+02 -3.003467563250634e+01 8.879071856866089e+01 + 4 6.030943128170970e+02 6.761820676461598e+01 -5.931358380448543e+02 -8.567616875825117e+01 + 5 1.082366686876419e+02 8.252910594964055e+00 6.919602145471869e+01 8.281893823364319e+01 + ME 6.473967683308769e-08 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.846061745194157e+02 -5.587431713332445e+01 2.848851387904505e+02 -2.522705449704535e+02 + 3 3.723414251304719e+02 -7.909406967610428e+01 2.575223087846817e+02 2.570302034556823e+02 + 4 2.507014391707248e+02 -1.454225725560076e+02 -1.575427155496305e+02 -1.299375995265171e+02 + 5 4.923509611793880e+02 2.803909593654362e+02 -3.848647320255017e+02 1.251779410412885e+02 + ME 7.269946299335500e-09 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.122124974385885e+02 -1.805344246922925e+02 -4.186310826559473e+01 3.682036728658804e+02 + 3 4.103314828537498e+02 1.875880572101384e+02 3.493840084974134e+02 1.054204022917875e+02 + 4 2.606101940762739e+02 1.386532314613984e+02 -1.016069521191985e+01 
-2.204307486141520e+02 + 5 4.168458256313883e+02 -1.457068639792442e+02 -2.973602050198988e+02 -2.531933265435160e+02 + ME 1.723302641567316e-07 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.648818809703109e+02 9.112562142302345e+01 5.529617988614829e+02 3.577670831179963e+02 + 3 1.821081589233734e+02 -1.118989809065603e+02 -1.417154481828441e+02 -2.363749912468558e+01 + 4 3.177811324832343e+02 1.923141187866275e+02 -1.574082799297883e+02 -1.980473713210938e+02 + 5 3.352288276230817e+02 -1.715407593030907e+02 -2.538380707488505e+02 -1.360822126722171e+02 + ME 3.557315883091281e-08 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.318877662083167e+02 -1.287773968646638e+02 1.171672255817830e+01 -2.595155765412093e+01 + 3 5.737869076225666e+02 4.983007023876717e+02 -1.830722893772106e+02 -2.177437995011738e+02 + 4 4.899092372804379e+02 -1.418218373570128e+02 1.492994579099249e+02 4.445304253807427e+02 + 5 3.044160888886795e+02 -2.277014681659951e+02 2.205610890910771e+01 -2.008350682254479e+02 + ME 1.612063977111499e-06 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.617768154532660e+02 -5.530096067488934e+01 -2.708467093957605e+02 3.698941619545819e+02 + 3 4.024250741463961e+02 3.719307063830553e+02 -8.950215258237559e+01 -1.249113871280559e+02 + 4 5.686589162433559e+02 -3.599400046259340e+02 3.327673296178380e+02 -2.882395885924826e+02 + 5 6.713919415698199e+01 4.331025891776792e+01 2.758153236029808e+01 4.325681376595668e+01 + ME 8.457469480303950e-07 + +Event 124 Batch 1 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.379698547837552e+02 -1.418344380556738e+02 4.784715070099305e+02 -2.008969233640272e+02 + 3 6.969091655902267e+02 1.102655237134664e+02 -6.144259960763416e+02 3.098460822883783e+02 + 4 1.204076926314279e+02 -5.195962553876777e+01 1.084548321656371e+02 5.979894532426238e+00 + 5 1.447132869945906e+02 8.352853988097525e+01 2.749965690077406e+01 -1.149290534567774e+02 + ME 1.777632406128968e-07 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.993265030449273e+02 -2.447949992392673e+02 -4.282124327582629e+02 3.404435019658305e+02 + 3 6.731775329270110e+01 -2.804109752229690e+01 -6.119041173487811e+01 1.053693426996913e+00 + 4 1.243834765225636e+02 -1.197301671655536e+02 -1.282222238280984e+01 -3.116932651684169e+01 + 5 7.089722671398083e+02 3.925662639271179e+02 5.022250668759508e+02 -3.103278688759857e+02 + ME 5.425574650973898e-06 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.963413774872839e+02 -2.028966757095202e+02 -8.071348940097566e+01 -2.003409185009524e+02 + 3 5.505085443412619e+02 -3.540820098777998e+02 -1.472538634584982e+02 3.949707424230037e+02 + 4 5.980459330604077e+02 5.127370036025512e+02 2.537494953815608e+02 -1.742724785511861e+02 + 5 5.510414511104670e+01 4.424168198476884e+01 -2.578214252208699e+01 -2.035734537086466e+01 + ME 6.044703153789196e-07 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 2.341333562245315e+02 1.279906072472883e+02 1.954399695910119e+02 1.549358705071583e+01 + 3 2.952995190591674e+02 2.065736836642997e+00 4.076907777821770e+01 -2.924643926788540e+02 + 4 3.901989577777040e+02 -5.888488716750147e+01 3.336956288095710e+02 1.934813273353844e+02 + 5 5.803681669385967e+02 -7.117145691642978e+01 -5.699046761788005e+02 8.348947829275392e+01 + ME 3.932058090274088e-08 + +Event 128 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.006194261205288e+02 1.970078759501975e+02 2.223923104087646e+01 -3.068334325383111e+01 + 3 4.169679269604398e+02 1.948380004768638e+02 -3.622744174691725e+02 -6.824699356921091e+01 + 4 4.053889184975367e+02 7.500867626306021e+01 3.464478799306621e+02 1.966919932722616e+02 + 5 4.770237284214948e+02 -4.668545526901215e+02 -6.412693502366069e+00 -9.776165644921963e+01 + ME 1.043565175740389e-08 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.252915081508855e+02 -9.538237368340913e+01 -7.350539936684781e+01 5.113025792912360e+02 + 3 3.101175064357083e+02 2.980876593097250e+02 -2.924507148532651e+01 8.038246673851862e+01 + 4 3.069384995973983e+02 -1.872130733221282e+02 3.995859154565957e+01 -2.399287783365572e+02 + 5 3.576524858160083e+02 -1.549221230418743e+01 6.279187930651464e+01 -3.517562676931971e+02 + ME 1.000128208635455e-05 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.459228778168211e+02 2.565546925335569e+02 -2.098380620336044e+02 5.544042221593890e+02 + 3 3.439568781167832e+02 -2.720151357658145e+02 
1.417213346239043e+02 -1.556571978116284e+02 + 4 4.116171180733708e+02 -4.853282653097603e+01 -6.103612751111103e+00 -4.087003334367686e+02 + 5 9.850312599302553e+01 6.399326976323382e+01 7.422034016081116e+01 9.953309089007998e+00 + ME 2.054246195557976e-06 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.691384726730906e+02 9.209415983561641e+01 8.878922249924507e+01 -3.462634137467797e+02 + 3 2.941454650704778e+02 5.402596827688080e+01 5.252299287417021e+01 2.843309420249809e+02 + 4 3.384067227193843e+02 -4.862827719487453e+01 3.183719183147821e+02 -1.038928403081678e+02 + 5 4.983093395370473e+02 -9.749185091762266e+01 -4.596841336881973e+02 1.658253120299667e+02 + ME 1.996541047993167e-08 + +Event 132 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.707345198779215e+02 6.870837694602629e+01 -2.400964784749993e+02 1.045467369881798e+02 + 3 4.279656548163086e+02 3.103112050689890e+02 2.902864584573433e+02 5.094437895662995e+01 + 4 4.190719066649796e+02 -1.246682736407299e+02 1.263571350494781e+02 -3.796221264960058e+02 + 5 3.822279186407902e+02 -2.543513083742854e+02 -1.765471150318222e+02 2.241310105511960e+02 + ME 3.089894195205159e-08 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.929720700957801e+02 5.142339919817747e+02 2.604838772561675e+02 -3.846060856420667e+02 + 3 3.148753114588355e+01 -2.203036177987944e+01 1.741009785128097e+01 -1.424837781361217e+01 + 4 2.496211560069212e+02 -1.554620265425121e+02 -1.952705634097227e+02 3.418610123488798e+00 + 5 5.259192427514154e+02 
-3.367416036593830e+02 -8.262341169772598e+01 3.954358533321899e+02 + ME 1.942621628891668e-06 + +Event 134 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.197916906084879e+02 6.019490617396247e+01 -8.116208159236564e+01 -6.433769440337215e+01 + 3 5.683824394481229e+02 1.081755691462717e+02 -5.398427847909500e+02 1.411609415597359e+02 + 4 4.816985571718548e+02 -2.884024598071592e+02 3.717366705283752e+02 1.032926374099569e+02 + 5 3.301273127715344e+02 1.200319844869250e+02 2.492681958549405e+02 -1.801158845663207e+02 + ME 9.649486812359519e-08 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.350166624422527e+02 -2.198562968051228e+02 -3.373781523021508e+01 -2.505217209995491e+02 + 3 5.912215178852441e+02 -2.771944434106617e+01 -4.569913276888497e+02 3.740767862794175e+02 + 4 2.502486776794080e+02 6.054245701130878e+01 2.317565313843600e+02 -7.244254095499564e+01 + 5 3.235131419930952e+02 1.870332841348801e+02 2.589726115347048e+02 -5.111252432487285e+01 + ME 5.191826702421519e-07 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.090067243858946e+02 -5.732442171651003e+01 7.824626590657545e+01 4.973830021108071e+01 + 3 6.815640342145938e+02 -4.337142854700793e+02 -4.067901902413736e+02 3.330813600870478e+02 + 4 2.032618343052463e+02 1.129685178335902e+02 1.212961788254292e+02 -1.176466075412493e+02 + 5 5.061674070942649e+02 3.780701893529990e+02 2.072477455093691e+02 -2.651730527568794e+02 + ME 1.434646277503734e-05 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.368852252705567e+02 2.060183932929131e+02 2.064637249428417e+02 -3.252658173977681e+02 + 3 3.381060129384422e+02 1.187665134594072e+02 -2.812383212949928e+02 1.453106943793225e+02 + 4 4.559721413906875e+02 -1.886887365516218e+02 2.518423412375507e+02 3.299736195257900e+02 + 5 2.690366204003138e+02 -1.360961702006985e+02 -1.770677448853996e+02 -1.500184965073444e+02 + ME 2.210571785846130e-07 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.281444835382400e+02 -7.170216505610387e+01 8.048316021729237e+01 3.099355780227290e+02 + 3 4.195412269153062e+02 -8.233072610126706e+01 3.843189873918888e+02 1.467494754720159e+02 + 4 5.954423007595251e+02 2.591445083415473e+02 -3.916019632055438e+02 -3.661195975958926e+02 + 5 1.568719887869287e+02 -1.051116171841762e+02 -7.320018440363728e+01 -9.056545589885236e+01 + ME 2.700955165177189e-07 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.608240142934881e+02 -1.978529853543029e+02 -1.693530849717742e+02 -1.424412963654430e+01 + 3 4.955759006510129e+02 -4.667016398458183e+02 -1.515076386550367e+02 6.950171293185227e+01 + 4 6.747340014191125e+02 6.026021606891686e+02 2.951526224468366e+02 -7.086281159076066e+01 + 5 6.886608363638693e+01 6.195246451095253e+01 2.570810117997448e+01 1.560522829545276e+01 + ME 2.378175597656896e-04 + +Event 140 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.602659806312456e+02 
-3.561732527960755e+02 2.435133213622241e+01 -4.836531211568239e+01 + 3 3.725079748856115e+02 -2.434506824135563e+02 -2.792614056347936e+02 -3.882040582776654e+01 + 4 4.679082590584683e+02 4.480807144504857e+02 8.348713917835200e+01 -1.057908777942769e+02 + 5 2.993177854246740e+02 1.515432207591460e+02 1.714229343202193e+02 1.929765957377260e+02 + ME 3.021653806870066e-08 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.518456049687267e+02 -1.447323876721944e+02 -2.797778873058075e+02 -4.531119524540314e+02 + 3 3.714883419349470e+02 1.931277822064916e+02 8.611260447142450e+01 3.054339000254000e+02 + 4 3.355296667174946e+02 -2.406718940601041e+02 6.932072359930871e+01 2.232752425521988e+02 + 5 2.411363863788320e+02 1.922764995258070e+02 1.243445592350743e+02 -7.559719012356739e+01 + ME 5.558624997903814e-07 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.034813172564560e+02 -2.283824334287207e+02 -1.981635898359731e+02 -2.595314409103105e+01 + 3 6.163677719793856e+02 6.075815331760194e+02 1.023397473350790e+02 1.674774592897086e+01 + 4 2.855126706949896e+02 -1.929815176467484e+02 2.738809148793404e+01 -2.086276861272057e+02 + 5 2.946382400691695e+02 -1.862175821005503e+02 6.843575101296037e+01 2.178330842892659e+02 + ME 1.484999423386209e-08 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.765102965824574e+02 6.404671225068569e+01 -2.523880643688530e+02 -9.303885064928588e+01 + 3 4.906331167889081e+02 4.654623693262335e+02 -8.642087129466319e+01 1.288296201806038e+02 + 4 
3.107154638669692e+02 -1.787822362222276e+02 1.647042861868789e+02 -1.935290924102013e+02 + 5 4.221411227616653e+02 -3.507268453546915e+02 1.741046494766373e+02 1.577383228788835e+02 + ME 1.469564558839036e-08 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.275344822318363e+02 -5.170608027367085e+02 7.533492100171161e+01 -7.256311662648037e+01 + 3 8.007363982231995e+01 5.428447222359200e+01 4.041177195768913e+00 -5.872523100570221e+01 + 4 6.090813110431083e+02 5.981821680688865e+02 -2.587994404401105e+01 -1.117513568717483e+02 + 5 2.833105669027354e+02 -1.354058375557700e+02 -5.349615415346945e+01 2.430397045039308e+02 + ME 8.201568885466726e-07 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.325096538893530e+02 -2.426159535975006e+02 -3.315129682972141e+02 -3.388261831619632e+02 + 3 2.284791393037104e+02 -1.566914323798220e+02 8.733052769045943e+01 1.415057986410094e+02 + 4 4.231517474846562e+02 4.111259958116378e+02 7.341849952547859e+01 -6.813619371989205e+01 + 5 3.158594593222808e+02 -1.181860983431524e+01 1.707639410812761e+02 2.654565782408458e+02 + ME 8.118145238090913e-09 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.853154876963473e+02 -1.115425014580856e+02 2.346915094935451e+00 4.723175679095090e+02 + 3 1.780978034776686e+02 -1.279353210791639e+02 1.013248544366916e+02 7.130676754225198e+01 + 4 2.669235424450998e+02 2.236642823219127e+02 1.425247145974823e+02 -3.015248006003818e+01 + 5 5.696631663808843e+02 1.581354021533681e+01 -2.461964841291094e+02 
-5.134718553917228e+02 + ME 1.085594956395671e-06 + +Event 147 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.616553850833652e+02 -3.952525530052052e+02 -1.318566300751644e+02 1.987836588468953e+02 + 3 2.908739185877554e+02 2.228658584604866e+02 -1.479014920521994e+02 -1.142960817176148e+02 + 4 2.305801020752398e+02 -4.980254073033321e+01 -8.282806011201673e+01 2.093475647527614e+02 + 5 5.168905942536403e+02 2.221892352750518e+02 3.625861822393805e+02 -2.938351418820419e+02 + ME 8.015295559309881e-09 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.986236365167860e+02 2.143922541391385e+02 7.424054753538570e+01 -1.941658537263819e+02 + 3 5.431300986327868e+02 -2.806204744558454e+02 1.772213579739537e+02 4.299244627081655e+02 + 4 2.075257794515811e+02 -1.671248660321451e+01 1.739190576128833e+02 -1.119812632879726e+02 + 5 4.507204853988461e+02 8.294070691992134e+01 -4.253809631222228e+02 -1.237773456938111e+02 + ME 1.835486046277023e-08 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.606702652730058e+02 1.482150598519967e+02 5.691576771795033e+01 -2.465411854119753e+01 + 3 5.470495180870233e+02 2.976789640827242e+02 4.494205378211043e+02 -9.312137115866565e+01 + 4 8.855244506584305e+01 2.448792744151061e+01 -2.255805030539081e+01 -8.205492857335213e+01 + 5 7.037277715741277e+02 -4.703819513762315e+02 -4.837782552336639e+02 1.998304182732153e+02 + ME 1.005685036833820e-06 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.589228438361085e+02 -4.599888281396885e+01 -1.940699740613964e+02 1.651120438001223e+02 + 3 3.672343011246722e+02 -1.252842613728790e+02 -3.439632022826294e+02 -2.922672151058827e+01 + 4 2.630407655820261e+02 -2.136932482478519e+02 6.383958259969681e+01 -1.394637863560173e+02 + 5 6.108020894571931e+02 3.849763924346998e+02 4.741935937443290e+02 3.578464066483268e+00 + ME 4.010956619074534e-08 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.074288193573346e+02 1.543896703986275e+02 -1.365285657072381e+02 2.346264094481421e+01 + 3 1.187518680134530e+02 6.614859683574457e+01 2.146588978980582e+01 -9.625790808488716e+01 + 4 5.855490127465465e+02 -2.497256519667727e+01 4.593548187907377e+02 -3.622667080614523e+02 + 5 5.882702998826660e+02 -1.955657020376948e+02 -3.442921428733055e+02 4.350619752015253e+02 + ME 5.761547120959551e-07 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.469187816458222e+02 3.346653279286311e+01 6.576737700245441e+00 2.445518808627398e+02 + 3 2.050488318670079e+02 2.024852188395404e+02 -3.198592830617966e+01 -4.654029643207459e+00 + 4 4.457639178610944e+02 1.763406247999739e+02 -3.941620330002511e+02 -1.106604999678583e+02 + 5 6.022684686260749e+02 -4.122923764323775e+02 4.195712236061854e+02 -1.292373512516739e+02 + ME 1.239502861741274e-07 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.388118256285089e+02 -1.907263201164207e+02 2.746355408008762e+02 
5.470140518836755e+01 + 3 4.321891825954877e+02 3.144038017692075e+02 9.731608721136165e+01 -2.801201851982382e+02 + 4 2.946402774128102e+02 -2.098835107053741e+02 -1.526675250199261e+02 -1.394791447212633e+02 + 5 4.343587143631938e+02 8.620602905258721e+01 -2.192841029923117e+02 3.648979247311337e+02 + ME 6.898610066466821e-08 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.166018743478079e+02 2.716108235141308e+02 -4.142745723228110e+02 -5.178053549312899e+02 + 3 4.336582961481601e+02 -2.527703288032555e+02 2.658228397098244e+02 2.313112547703298e+02 + 4 1.758894695506488e+02 -5.524813141932554e+01 1.166356953616248e+02 1.195025691815528e+02 + 5 1.738503599533830e+02 3.640763670845024e+01 3.181603725136177e+01 1.669915309794072e+02 + ME 1.306099724491123e-05 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.311412179822987e+02 4.570041908288398e+00 2.309160670256665e+02 -9.118517242124806e+00 + 3 1.127940168856364e+02 9.688938222147931e+01 5.762608966553505e+01 -3.764524909869577e+00 + 4 4.841340752681729e+02 1.199724179371482e+02 1.518135540047473e+02 -4.437849328115263e+02 + 5 6.719306898638911e+02 -2.214318420669162e+02 -4.403557106959487e+02 4.566679749635207e+02 + ME 5.329734049261943e-07 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.939383624002053e+02 -5.584434953296783e+01 -1.074936322310988e+02 -1.514549968323210e+02 + 3 5.333164952329514e+02 -4.350263902124028e+02 2.486552239121395e+01 -3.075064709929486e+02 + 4 5.439544646395683e+02 2.632706271852282e+02 
6.316511808396704e+01 4.717893643568649e+02 + 5 2.287906777272743e+02 2.276001125601419e+02 1.946299175591792e+01 -1.282789653159486e+01 + ME 2.999848247991722e-07 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.312192489929148e+02 -1.585569120979517e+02 1.680571010706740e+02 -8.880175140797474e+00 + 3 4.848229799559913e+02 2.683425948717759e+02 -3.974996855026540e+02 7.098995489631305e+01 + 4 4.746943934695868e+02 8.517047965358651e+01 4.145824897329423e+02 -2.149467743276730e+02 + 5 3.092633775815074e+02 -1.949561624274110e+02 -1.851399053009623e+02 1.528369945721574e+02 + ME 6.751148578886882e-09 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.018805202065023e+02 -1.753303435111356e+02 -2.369304707527991e+02 -5.247698383836233e+02 + 3 8.634821849561429e+01 -2.991645269896793e+01 7.632972427838881e+01 -2.710708185810307e+01 + 4 4.997026770426654e+02 3.739093325581301e+02 1.867465724157428e+01 3.309770893074773e+02 + 5 3.120685842552182e+02 -1.686625363480266e+02 1.419260892328361e+02 2.208998309342491e+02 + ME 1.235210652597672e-06 + +Event 159 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.303650459443540e+02 1.779155779553054e+02 1.687725002807366e+02 3.536608022392421e+02 + 3 5.511053464986991e+02 -4.854202618317907e+02 -2.099654990918665e+02 -1.549153366639674e+02 + 4 1.610168062150694e+02 -8.022454684444293e+00 -1.072391010449717e+02 -1.198408415841761e+02 + 5 3.575128013418772e+02 3.155271385609296e+02 1.484320998561018e+02 -7.890462399109860e+01 + ME 6.473917645843670e-08 + +Event 160 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.944516008522269e+02 -6.281446959798446e+01 1.015008487838442e+02 -1.535035673177559e+02 + 3 4.230235916153154e+02 -3.998222205375998e+02 -6.268267453255920e+01 -1.231342086544157e+02 + 4 6.245424625752946e+02 4.116166861635971e+02 1.496811615662379e+02 4.452196548935074e+02 + 5 2.579823449571634e+02 5.102000397198711e+01 -1.884993358175230e+02 -1.685818789213359e+02 + ME 2.025948042789642e-07 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.155018654656784e+02 -7.068712627844450e+01 9.130062612448707e+01 2.864746729386987e+00 + 3 4.158111502215671e+02 2.244999292291257e+02 2.797316750395929e+02 -2.103541879883635e+02 + 4 2.674174137253543e+02 -2.901773797454458e+01 2.441511879743710e+02 -1.051676825697958e+02 + 5 7.012695705874006e+02 -1.247950649761367e+02 -6.151834891384510e+02 3.126571238287723e+02 + ME 1.399778033826709e-07 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.251731088161691e+02 4.809658489651715e+02 2.183673650866463e+02 -3.344083760631616e+02 + 3 1.108846666575799e+02 1.085837095121457e+02 1.269400989282292e+01 -1.854317775316524e+01 + 4 4.812365383625106e+02 -3.785395977737376e+02 -2.468116052076603e+02 1.654702701612803e+02 + 5 2.827056861637401e+02 -2.110099607035797e+02 1.575023022819107e+01 1.874812836550462e+02 + ME 7.221635073705518e-06 + +Event 163 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 2.197406137989223e+02 3.484446545657324e+01 -2.083509112967882e+02 6.051196858118807e+01 + 3 2.294753481748079e+02 -4.289167430922016e+01 1.411072116241522e+02 1.758066964667666e+02 + 4 6.045742534248393e+02 -3.922481835179260e+01 -2.454372160591728e+02 -5.511188751100743e+02 + 5 4.462097846014299e+02 4.727202720443959e+01 3.126809157318089e+02 3.148002100621197e+02 + ME 7.488878615051639e-08 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.294925844427889e+02 7.335206987339836e+01 3.211431815175450e+02 7.203749916922543e+00 + 3 4.456282610861912e+02 -2.140706694514152e+02 -2.874930256302156e+02 2.647754818205600e+02 + 4 5.123820383524132e+02 1.705541274100712e+02 -2.866217204657405e+01 -4.823122668387517e+02 + 5 2.124971161186064e+02 -2.983552783205417e+01 -4.987983840755348e+00 2.103330351012688e+02 + ME 4.841789872662523e-08 + +Event 165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.181198415073811e+02 -4.576729557413967e+01 1.980904969172526e+01 2.123422272308199e+02 + 3 2.986822563453928e+02 -2.114010465926206e+02 -1.050902232874885e+02 -1.829664797838815e+02 + 4 6.054978258860255e+02 4.739070609379933e+02 3.590040584148246e+02 1.146987392585553e+02 + 5 3.777000762612001e+02 -2.167387187712331e+02 -2.737228848190614e+02 -1.440744867054938e+02 + ME 1.354049988290246e-08 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.222232017840728e+02 3.192297607931550e+02 6.501391151310646e+00 -4.333455324590233e+01 + 3 3.053327974515881e+02 1.412228998217293e+02 
2.069513876027933e+02 1.745145636754503e+02 + 4 4.336545371614683e+02 -3.424637997499572e+02 2.077212038433834e+02 -1.662128302370304e+02 + 5 4.387894636028710e+02 -1.179888608649270e+02 -4.211739825974873e+02 3.503281980748253e+01 + ME 8.791837821614621e-09 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.729290336372008e+02 2.217891042804636e+02 -3.163933861400006e+01 -1.558807328590503e+02 + 3 1.537967125602960e+02 -5.805059168074693e+01 -5.204366344775281e+00 1.423252337810090e+02 + 4 6.018172045649106e+02 -4.859322890486577e+02 2.249815041196952e+02 -2.746581165272472e+02 + 5 4.714570492375926e+02 3.221937764489410e+02 -1.881377991609198e+02 2.882136156052885e+02 + ME 2.550988240672509e-08 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.302714671417868e+01 -4.694579621882250e+01 -8.099628082448689e+00 -2.328876404951166e+01 + 3 5.269668821589169e+02 -9.568171326198457e+01 -5.105187840955386e+02 -8.893635772367716e+01 + 4 3.135748577105153e+02 3.091348665867710e+02 -5.194641375603634e+01 -8.148358511739488e+00 + 5 6.064311134163894e+02 -1.665073571059640e+02 5.705648259340236e+02 1.203734802849283e+02 + ME 1.441916509771820e-07 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.789601344296162e+02 5.468556852911152e+02 -1.869602488084409e+02 3.449002883486752e+01 + 3 5.891143790802998e+02 -5.265905860904877e+02 1.639376395451868e+02 2.070810387756352e+02 + 4 1.164890241423856e+02 4.243473362119256e+01 9.674051308718404e+01 -4.909439128629754e+01 + 5 2.154364623476983e+02 
-6.269983282181995e+01 -7.371790382393006e+01 -1.924766763242052e+02 + ME 3.932629496609272e-07 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.175951285792084e+02 -5.046759251961064e+02 2.162362154945924e+02 -2.827858774422785e+02 + 3 8.257615191587736e+01 -1.244558951936122e+01 -3.459182778741835e+01 7.394142017215529e+01 + 4 3.412788931358452e+02 9.206397792120845e+01 -3.135461028995509e+02 9.840908609399399e+01 + 5 4.585498263690690e+02 4.250575367942591e+02 1.319017151923769e+02 1.104353711761293e+02 + ME 2.452067700605293e-07 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.241099768142415e+02 -1.801845393564228e+02 -4.755218322366496e+02 3.618366786304098e+02 + 3 1.760989498853843e+02 1.628509626135169e+02 2.215130304491187e+01 6.324337040353328e+01 + 4 1.106507541955660e+02 8.705766835783074e+01 6.684051231296792e+01 1.403202401243385e+01 + 5 5.891403191048080e+02 -6.972409161492480e+01 3.865300168787698e+02 -4.391120730463770e+02 + ME 2.519665006748556e-06 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.170584705570328e+02 -2.734753423202163e+02 -1.216658779499090e+02 2.904245498384904e+02 + 3 2.310308870099136e+02 8.118388166822322e+01 2.158679150802150e+02 -1.361952087220693e+01 + 4 5.904124943192924e+02 2.597890264188085e+02 -2.749362883764528e+02 -4.533283716407744e+02 + 5 2.614981481137613e+02 -6.749756576681538e+01 1.807342512461468e+02 1.765233426744909e+02 + ME 1.101757796579511e-07 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.257185513659660e+02 -1.853427295891294e+02 1.999009554138799e+02 -1.782701750630550e+02 + 3 5.945302616762999e+02 -1.868533588100201e+02 -3.653044045316943e+02 4.302380113147229e+02 + 4 4.859436879418532e+02 4.057526358399537e+02 7.788756376499921e+01 -2.558116372845700e+02 + 5 9.380749901588128e+01 -3.355654744080421e+01 8.751588535281510e+01 3.843801032902204e+00 + ME 4.039581462260049e-07 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.305554368133527e+02 -4.672890155754842e+02 -2.496372833150609e+02 2.848287615003386e+01 + 3 3.299992352974425e+02 -6.928313446547062e+01 3.005180188229560e+02 -1.174234343658106e+02 + 4 4.859186117296837e+02 4.836387558179939e+02 -2.840874679686286e+00 -4.692952724224148e+01 + 5 1.535267161595213e+02 5.293339422296093e+01 -4.803986082820875e+01 1.358700854580182e+02 + ME 2.458608059161492e-08 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.655594170959286e+02 -8.549977165527977e+00 3.601534549686833e+02 6.204896608409048e+01 + 3 5.484575297133065e+02 -1.135388551231450e+02 -5.289508375770935e+02 -9.014211947274862e+01 + 4 2.960714747917293e+02 2.867045434701770e+02 -5.821682656799006e+01 -4.549312083446880e+01 + 5 2.899115783990353e+02 -1.646157111815041e+02 2.270142091764003e+02 7.358627422312698e+01 + ME 7.465915231429066e-08 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.687299918743938e+02 
8.339274256771590e+01 3.315319220567166e+02 1.381811928306608e+02 + 3 1.629485641149831e+02 -9.683584082676978e+01 -1.409503471845349e+01 1.302934552262426e+02 + 4 4.822020608335144e+02 -2.484261719570100e+02 -3.907996987564750e+02 1.344576513654873e+02 + 5 4.861193831771092e+02 2.618692702160639e+02 7.336281141821193e+01 -4.029322994223907e+02 + ME 7.475547307992145e-08 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.577057842620000e+02 2.915980668115857e+00 1.386867066994572e+02 -2.171860163171064e+02 + 3 7.376522012584236e+02 -1.434041695523210e+02 -3.410539372724913e+02 6.381600316905930e+02 + 4 6.808763621736034e+01 2.120489361288136e+01 5.915691106678145e+01 -2.620569719681001e+01 + 5 4.365543782622163e+02 1.192832952713237e+02 1.432103195062527e+02 -3.947683181766764e+02 + ME 1.034453123793404e-04 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.161481411810116e+02 5.961255761930873e+02 3.994627094820481e+01 1.505892583373998e+02 + 3 4.209349270977737e+02 -3.821919352609090e+02 8.968205956028467e+01 -1.518968915592243e+02 + 4 3.373669481877312e+02 -2.683700810084368e+02 -1.756017833436886e+02 1.046803278345296e+02 + 5 1.255499835334836e+02 5.443644007625857e+01 4.597345283519908e+01 -1.033726946127052e+02 + ME 4.921930840696330e-08 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.043080164604084e+02 -4.190938142816105e+02 2.545150292529335e+02 1.179366362989085e+02 + 3 3.183281589704841e+02 3.585230981201615e+01 1.721259596521259e+02 -2.653678252699747e+02 + 4 
2.314375836919170e+02 2.283484903133223e+02 -2.615876940174820e+01 2.713007372208610e+01 + 5 4.459262408771900e+02 1.548930141562721e+02 -4.004822195033112e+02 1.203011152489801e+02 + ME 2.475648278180600e-08 + +Event 180 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.882297939654284e+02 -6.668050208816671e+02 1.625262598210258e+02 5.115201166409270e+01 + 3 1.329650151441869e+02 1.107437070332806e+02 -6.736705495672317e+01 2.961767227988411e+01 + 4 4.738727920676824e+02 4.060266452073242e+02 -2.342858705864515e+02 -6.933914685987033e+01 + 5 2.049323988227020e+02 1.500346686410624e+02 1.391266657221484e+02 -1.143053708410681e+01 + ME 1.234528840839614e-06 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.428334272971977e+02 1.994988487176103e+02 -8.712095886553354e+01 1.076021435641570e+02 + 3 5.756669394886244e+02 -4.225670293516123e+02 3.014220455272963e+02 2.489463424019197e+02 + 4 2.315484652385182e+02 -9.345090162870810e+01 6.540105188821647e+01 -2.015051442296440e+02 + 5 4.499511679756595e+02 3.165190822627101e+02 -2.797021385499791e+02 -1.550433417364326e+02 + ME 7.867586943237503e-08 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.444435281163277e+02 1.469003270004890e+01 -4.049746300418324e+02 -1.825097302782552e+02 + 3 5.448711840887840e+02 -2.922556364469128e+02 4.166227684668842e+02 -1.946707965009441e+02 + 4 3.116991105767848e+02 1.944903403636982e+02 -1.026079273884968e+02 2.209105164463849e+02 + 5 1.989861772181040e+02 8.307526338316568e+01 9.095978896344508e+01 
1.562700103328142e+02 + ME 2.832811162875883e-07 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.401094905126758e+02 -1.821299783062261e+02 1.426447704035357e+02 6.429391816457414e+01 + 3 5.112023061484729e+02 4.864809735593811e+02 1.476911503742452e+02 5.339835463163436e+01 + 4 1.651868611336106e+02 5.963144449802130e+01 1.535106867308423e+02 -1.285530931869607e+01 + 5 5.835013422052410e+02 -3.639824397511763e+02 -4.438466075086230e+02 -1.048369634775124e+02 + ME 3.264580050034136e-08 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.826515649016759e+02 -1.477902646129266e+02 1.867155418966292e+02 -1.522736065155598e+02 + 3 6.027880405940905e+02 1.749283586593595e+01 -3.719329937924915e+02 4.740393134595325e+02 + 4 2.613827332123843e+02 -1.367477909345774e+02 -2.306183173214314e+01 -2.215606617006907e+02 + 5 3.531776612918492e+02 2.670452196815680e+02 2.082792836280054e+02 -1.002050452432820e+02 + ME 3.276509027347192e-08 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.293902171331328e+02 2.274132505448970e+01 4.856616106115517e+02 -2.094507916175660e+02 + 3 3.105838365977013e+02 -2.358986821094530e+02 -1.967102753115335e+02 4.603475775576092e+01 + 4 1.784474258301200e+02 3.764653629262911e+01 6.082173661233958e+00 -1.743250677731650e+02 + 5 4.815785204390459e+02 1.755108207623340e+02 -2.950335089612522e+02 3.377411016349700e+02 + ME 2.003022644086753e-08 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.989720483234584e+02 -1.741182591155330e+02 -6.550171034983227e+01 7.058494030496487e+01 + 3 5.152114262802828e+02 -4.648498795531517e+02 -1.436720305668071e+02 1.694572243428830e+02 + 4 5.546843843841009e+02 4.191879013201379e+02 1.907167584026883e+02 -3.091656315484595e+02 + 5 2.311321410121571e+02 2.197802373485464e+02 1.845698251395178e+01 6.912346690061166e+01 + ME 2.477617615966958e-03 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.129865985452144e+02 8.791089187031965e+01 -7.377241052563376e+01 -3.967206033391113e+02 + 3 6.138096947611981e+02 2.827115017631438e+02 3.326539175498887e+02 4.314833939937611e+02 + 4 1.561121885082346e+01 -1.526480341131205e+00 1.058464960658816e+01 -1.137300331848183e+01 + 5 4.575924878427639e+02 -3.690959132923323e+02 -2.694661566308431e+02 -2.338978733616801e+01 + ME 4.960920496803351e-06 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.409186700623274e+02 2.554881275349464e+02 3.574186001581244e+02 -3.724291287245739e+01 + 3 5.678467400131306e+02 -2.085482205222034e+02 -5.142972510897583e+02 -1.202326801750690e+02 + 4 3.251230260129323e+02 -1.394598252065705e+02 2.786827561471973e+02 9.269228998876032e+01 + 5 1.661115639116090e+02 9.251991819382761e+01 -1.218041052155634e+02 6.478330305876631e+01 + ME 9.764924485701689e-09 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.048089636454499e+02 -1.135472313170012e+02 1.696480662060317e+02 
-1.653092502918516e+01 + 3 5.473280721360719e+02 8.276565856454246e+01 -4.677176099525417e+02 -2.719523885735314e+02 + 4 5.707624195781957e+02 1.366705484110403e+02 3.757862824583083e+02 4.072782472904075e+02 + 5 1.771005446402819e+02 -1.058889756585815e+02 -7.771673871179847e+01 -1.187949336876910e+02 + ME 3.275771424816761e-08 + +Event 190 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.691905082890465e+02 1.091304876593432e+02 -1.415440939324063e+02 -3.230440179709455e+02 + 3 5.165238126843833e+02 6.541410712031542e+01 5.564220852798429e+00 5.123347373602195e+02 + 4 4.306101857863021e+02 -3.416491223880156e+02 2.067968524389392e+02 -1.610467978438568e+02 + 5 1.836754932402679e+02 1.671045276083568e+02 -7.081697935933126e+01 -2.824392154541714e+01 + ME 2.515606677569527e-07 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.069337892507111e+02 -3.717000808244267e+02 -1.559978392326048e+02 -5.566716180851297e+01 + 3 5.558199679384032e+02 4.288254322302435e+02 -3.467374488240874e+02 6.940984810929830e+01 + 4 1.131764079543421e+02 2.092547930706861e+01 8.924670515456381e+01 -6.637807810016390e+01 + 5 4.240698348565433e+02 -7.805083071288540e+01 4.134885829021284e+02 5.263539179937848e+01 + ME 5.965651857802165e-08 + +Event 192 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.660270538535899e+02 -6.938500902735627e+00 3.406931051238016e+02 -1.336258563267746e+02 + 3 2.198197890933955e+02 1.850047155697348e+02 -9.607133768696318e+01 -6.974448343104864e+01 + 4 4.132582096760702e+02 -1.250851363991760e+02 
-2.616302956397871e+02 -2.944242600739754e+02 + 5 5.008949473769451e+02 -5.298107826782336e+01 1.700852820294868e+01 4.977945998317989e+02 + ME 7.575537943814201e-08 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.330325958487632e+02 5.870041499782552e+01 1.287754330113803e+02 1.851360036833128e+02 + 3 5.708908650052944e+02 1.115253094670216e+02 4.567035139831037e+02 -3.238832897978265e+02 + 4 1.796971979479947e+02 7.338418865332758e+01 -1.320667158645153e+02 9.728425549293682e+01 + 5 5.163793411979480e+02 -2.436099131181745e+02 -4.534122311299690e+02 4.146303062157691e+01 + ME 3.460292919730256e-08 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.026272994324191e+02 -2.281452380970013e+02 1.481292949894225e+02 4.226602233879194e+02 + 3 3.230851168166026e+02 6.671920747475995e+01 -1.604438231736436e+02 -2.723790000491271e+02 + 4 4.149226527531526e+02 1.112824714324781e+02 1.170540097886191e+02 -3.821980875305355e+02 + 5 2.593649309978254e+02 5.014355918976329e+01 -1.047394816043981e+02 2.319168641917431e+02 + ME 2.749366591170945e-05 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.186769531653990e+02 4.053399161407182e+02 -4.064715764973513e+01 -3.210500626116971e+02 + 3 1.983392180439615e+02 1.362622940759870e+02 -2.970600443234310e+01 1.410268978886827e+02 + 4 4.580369644375026e+02 -4.270735786572081e+02 -1.650789232562077e+02 -1.244862645509944e+01 + 5 3.249468643531377e+02 -1.145286315594970e+02 2.354320853382861e+02 1.924717911781138e+02 + ME 1.170680237886963e-08 + +Event 196 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.670680736096177e+02 -5.196009607058459e+02 -3.195159447878110e+02 -2.700078100521552e+02 + 3 4.796337064789745e+02 4.109757184160675e+02 1.611950782145398e+02 1.875195937030707e+02 + 4 2.549467832538206e+02 1.757938664861856e+02 1.180485581477291e+02 1.419820999864066e+02 + 5 9.835143665758714e+01 -6.716862419640717e+01 4.027230842554202e+01 -5.949388363732206e+01 + ME 2.979904205282766e-05 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.456277846903779e+02 5.073638365313777e+02 -6.950772545675264e+01 -1.883090356518230e+02 + 3 3.840060774912939e+02 -2.335726483981930e+02 2.994458700181055e+02 -5.689161976603956e+01 + 4 2.170201197002058e+02 5.304772290063384e+01 -1.145548521448566e+02 1.765243816015771e+02 + 5 3.533460181181225e+02 -3.268389110338186e+02 -1.153832924164962e+02 6.867627381628544e+01 + ME 1.605725614438948e-08 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.910248171598183e+02 -4.090729546193913e+02 3.366364194938390e+02 -2.620030025937223e+02 + 3 5.547864644728130e+02 3.349061746249842e+02 -1.441441520604849e+02 4.181487041556714e+02 + 4 2.745197367303354e+02 5.772758735139705e+01 -1.507960221697820e+02 -2.220116465657828e+02 + 5 7.966898163703360e+01 1.643919264301007e+01 -4.169624526357205e+01 6.586594500383369e+01 + ME 6.790835194296702e-08 + +Event 199 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 3.089763723582333e+02 -2.391470875959899e+02 -7.198191922923307e+01 -1.819166635415397e+02 + 3 2.302930034245914e+02 2.280759743109415e+02 2.554139852644604e+01 1.907496604838036e+01 + 4 4.296042628313223e+02 2.011197297410303e+02 -3.594007704411145e+02 -1.222365050141414e+02 + 5 5.311263613858528e+02 -1.900486164559819e+02 4.058412911439016e+02 2.850782025073007e+02 + ME 1.175867940427483e-08 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.668574923646055e+02 1.957149396867698e+02 -4.074080523547823e+01 -1.767732174377509e+02 + 3 4.828614932981504e+02 -4.435255570781281e+02 2.718101129892900e+01 -1.889483912432794e+02 + 4 4.041444758692146e+02 3.452769428205599e+02 -1.849911387865645e+02 9.947295462858226e+01 + 5 3.461365384680295e+02 -9.746632542920156e+01 1.985509327231138e+02 2.662486540524481e+02 + ME 2.749175020762736e-08 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.557462218499875e+02 5.577307285874684e+01 -5.518883540368641e+01 3.469854912690419e+02 + 3 6.090180126474650e+02 -2.732890171778827e+02 3.408295323239707e+02 -4.243245016668710e+02 + 4 1.810043524304991e+02 1.725577415117764e+01 -1.199573841645603e+01 1.797801883330636e+02 + 5 3.542314130720480e+02 2.002601701679582e+02 -2.736449585038284e+02 -1.024411779352346e+02 + ME 3.504514518357024e-08 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.401898284303757e+02 -4.345767112988091e+02 6.008776977239310e+01 3.605033944437847e+01 + 3 5.108999584385473e+02 4.765299298274664e+02 
1.140292958983962e+02 1.446904046163914e+02 + 4 1.528255459328671e+02 1.932347971289792e+01 4.151372197535512e+01 1.458041889289091e+02 + 5 3.960846671982105e+02 -6.127669824155523e+01 -2.156307876461443e+02 -3.265449329896788e+02 + ME 6.813371080726688e-08 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.041261761344619e+02 -9.500166915393221e+01 3.778895461862847e+01 1.972151929195670e+01 + 3 4.443475859153141e+02 -3.287098418714518e+02 1.564021836721695e+02 -2.548194927947812e+02 + 4 2.899092144312161e+02 4.077089716485121e+01 2.868817947747053e+02 -9.160916075474477e+00 + 5 6.616170235190079e+02 3.829406138605328e+02 -4.810729330655033e+02 2.442588895782990e+02 + ME 1.665804935067878e-07 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.353583912015085e+02 3.684328216913379e+02 2.309035724253135e+02 -2.185691953026546e+01 + 3 2.535164374831818e+02 -2.445756557044239e+02 4.172545258629646e+01 -5.207993200248917e+01 + 4 3.377617009372445e+02 2.305821856525062e+02 -1.063652282272364e+01 2.465799802538221e+02 + 5 4.733634703780651e+02 -3.544393516394204e+02 -2.619925021888862e+02 -1.726431287210675e+02 + ME 6.710104132622880e-09 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.561765973099245e+02 -5.595525752729490e+02 -2.445909106268453e+02 -2.400915008936639e+02 + 3 4.436268818857168e+02 4.246930749258181e+02 -8.966125936745796e+01 9.165948401741433e+01 + 4 1.399697315148318e+02 1.499554105305907e+01 1.358590097348833e+02 3.014944382102273e+01 + 5 2.602267892895270e+02 
1.198639592940718e+02 1.983931602594199e+02 1.182825730552269e+02 + ME 9.509730500940703e-07 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.492493811814724e+01 -2.674402956441454e+00 5.513252874300673e+01 -3.418332097490475e+01 + 3 4.416704097344152e+02 1.072044637050194e+02 4.084318907704086e+02 1.294733347002166e+02 + 4 7.006249631956902e+02 7.376574991388429e+01 -6.937409068373729e+02 -6.447873582091822e+01 + 5 2.927796889517476e+02 -1.782958106624622e+02 2.301764873239575e+02 -3.081127790439360e+01 + ME 1.146704939046614e-06 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.871274697603226e+02 4.279496817610275e+02 2.164744983621550e+02 -8.538754730394098e+01 + 3 8.117859920617880e+01 -3.275578473866843e+01 6.568413093248336e+01 -3.467879004342295e+01 + 4 2.872554221624426e+02 2.156852159581689e+02 -8.320290545444449e+01 1.705076001318999e+02 + 5 6.444385088710566e+02 -6.108791129805279e+02 -1.989557238401938e+02 -5.044126278453623e+01 + ME 5.041753184810510e-08 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.641639267419051e+02 -3.419606422698960e+02 -3.074769342805806e+02 6.300003898602273e+01 + 3 1.786897064305950e+02 1.076490072974622e+02 -9.081241090814456e+01 1.099763994553988e+02 + 4 5.763848573443232e+02 4.797982260748441e+02 2.623521952599240e+02 -1.821661101126315e+02 + 5 2.807615094831767e+02 -2.454865911024102e+02 1.359371499288009e+02 9.189671671210021e+00 + ME 1.953475320961929e-08 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.385828658973287e+02 -5.075491230008665e+02 -3.533516944878140e+02 -1.591368730410633e+02 + 3 1.765423491478754e+02 1.621867223420716e+02 2.683860309775546e+01 6.436114916894576e+01 + 4 4.986667957256043e+02 4.344440716974440e+02 1.795281780354134e+02 1.664228200021638e+02 + 5 1.862079892291917e+02 -8.908167103864919e+01 1.469849133546452e+02 -7.164709613004628e+01 + ME 1.140886553579000e-05 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.764638473920721e+02 -4.170352276148382e+02 2.008387828191197e+02 1.129743338800030e+02 + 3 7.223003052451869e+02 5.315103137685568e+02 -4.106342874200929e+02 -2.656953129242833e+02 + 4 1.470953339362244e+02 -9.260579603966968e+01 1.071066173964053e+02 3.986698277482917e+01 + 5 1.541405134265168e+02 -2.186929011404899e+01 1.026888872045679e+02 1.128539962694512e+02 + ME 1.505587117575617e-06 + +Event 211 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.375648452949695e+02 -2.432158852065867e+02 -3.617678147661081e+02 -3.785597998390855e+01 + 3 3.755078213314551e+02 2.872422996521213e+02 2.203609879312930e+02 -9.969463460789410e+01 + 4 3.007064557415487e+02 -1.694459342837558e+02 -2.228285208135228e+02 1.098175677722218e+02 + 5 3.862208776320262e+02 1.254195198382212e+02 3.642353476483379e+02 2.773304681958095e+01 + ME 3.654780827158940e-09 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.121878142369525e+02 
1.183445947348010e+00 -2.465625978764644e+01 1.094384483270791e+02 + 3 7.375067391765947e+02 -3.127794647526267e+02 3.331246995416566e+02 -5.788895674610445e+02 + 4 2.291493811726511e+01 1.618073814178101e+01 -1.621519190684619e+01 -5.879230641442963e-01 + 5 6.273905084691874e+02 2.954152806634978e+02 -2.922532478471640e+02 4.700390421981098e+02 + ME 2.975776227583000e-04 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.149170341818730e+02 2.012777647785824e+00 4.394135149382410e+02 2.684236645797402e+02 + 3 2.443426884296023e+02 -1.176956726368923e+02 1.176742395059145e+02 1.788962028335422e+02 + 4 9.102154734540720e+01 -3.403376741680239e+01 -5.303122632979904e+01 -6.568343619611288e+01 + 5 6.497187300431175e+02 1.497166624059089e+02 -5.040565281143565e+02 -3.816364312171696e+02 + ME 8.306823007636184e-06 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.202063301686816e+02 -1.064350067374717e+02 1.642139056432389e+02 2.534511322016189e+02 + 3 8.302321932066316e+01 -5.764493522725375e+01 5.652661110437804e+01 -1.935610048710902e+01 + 4 5.674025885458663e+02 2.435984979474369e+02 2.596369945692085e+02 -4.418077640075727e+02 + 5 5.293678619647887e+02 -7.951855598271139e+01 -4.803775113168255e+02 2.077127322930627e+02 + ME 5.363501456781809e-07 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.138906695630311e+02 1.178864239647872e+02 -4.582866998625273e+02 2.003988823268451e+02 + 3 3.856670962690674e+02 -3.121084563031182e+01 3.362323895090096e+02 -1.863136401467070e+02 + 4 
2.798710520603011e+02 -1.660050298897437e+02 -1.022090328473867e+02 2.008069954883218e+02 + 5 3.205711821075997e+02 7.932945155526801e+01 2.242633432009039e+02 -2.148922376684599e+02 + ME 3.992433658497868e-09 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.502734568939956e+02 -3.484492676068055e+02 -3.562535325858312e+02 -2.333825310224929e+02 + 3 2.927316415798877e+02 -1.723462782434523e+01 1.859956838442979e+02 2.253894123003654e+02 + 4 4.062892609764483e+02 2.518503039165275e+02 2.467607920511911e+02 -2.018700064601230e+02 + 5 2.507056405496688e+02 1.138335915146231e+02 -7.650294330965788e+01 2.098631251822504e+02 + ME 7.801404292044693e-09 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.740538118662427e+02 4.049475823483304e+02 4.864798667202038e-01 2.464634565863596e+02 + 3 1.985267271768090e+02 -1.272681407062418e+02 -1.303933537284555e+02 -7.882420356817597e+01 + 4 5.247611246848360e+02 -4.998692904349177e+02 1.555305125498372e+02 3.626555013467214e+01 + 5 3.026583362721133e+02 2.221898487928292e+02 -2.562363868810232e+01 -2.039048031528558e+02 + ME 1.857260363869186e-08 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.734261191295302e+02 1.952249633514983e+02 1.466545247967154e+02 2.825362463281442e+02 + 3 1.398002662411877e+02 -9.684800712809063e+00 -1.064184732605320e+02 -9.014115389297820e+01 + 4 5.708039251040939e+02 -3.493876776423592e+02 -4.213571589346009e+02 -1.618750019010977e+02 + 5 4.159696895251884e+02 1.638475150036699e+02 3.811211073984175e+02 
-3.052009053406822e+01 + ME 2.933027765980187e-07 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.964338295601744e+02 -4.930977654258189e+02 4.159874786449745e+00 5.730476686633798e+01 + 3 4.573553743327014e+02 2.961037518728453e+02 1.328781681642411e+02 3.222419882373264e+02 + 4 2.353157927788248e+02 2.103198091366726e+02 -7.380868571206497e+01 -7.544122301035414e+01 + 5 3.108950033283002e+02 -1.332579558369914e+01 -6.322935723862584e+01 -3.041055320933104e+02 + ME 1.605645140699896e-07 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.895106323190152e+02 -9.842902929784732e+01 -3.242676826039605e+02 -1.920435599016529e+02 + 3 2.689406576306271e+02 6.079057874307623e+01 2.545565461740099e+02 6.192372455557899e+01 + 4 4.581215795927168e+02 4.170329892022734e+02 5.148915397769731e+01 1.825040674292038e+02 + 5 3.834271304576407e+02 -3.793945386475023e+02 1.822198245225329e+01 -5.238423208312999e+01 + ME 1.600817797406408e-08 + +Event 221 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.276347617265197e+02 4.709149142425277e+02 -1.408991586477921e+02 1.917941928695926e+02 + 3 3.954418917787426e+02 -3.935231131055337e+02 -3.463013336995996e+01 1.773705409753067e+01 + 4 4.144133623840756e+02 -1.412634588767815e+02 1.548014912649188e+02 -3.575186266627241e+02 + 5 1.625099841106623e+02 6.387165773978764e+01 2.072780075283326e+01 1.479873796956010e+02 + ME 1.257555614821603e-07 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.584442989006753e+02 -1.078486037163722e+02 -3.673694489405860e+01 -2.319752814214078e+02 + 3 4.134394487074567e+02 2.855923313448867e+02 1.924609957241842e+02 -2.287530615081986e+02 + 4 2.278563169252061e+02 -1.871641106410820e+02 1.009921504714390e+02 -8.178436521614033e+01 + 5 6.002599354666619e+02 9.420383012567585e+00 -2.567162013015646e+02 5.425127081457468e+02 + ME 1.910125239441710e-07 + +Event 223 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.884176534843068e+02 -2.685843465024753e+02 7.606026273343657e+01 7.253983793855950e+01 + 3 4.513000265569298e+02 1.572877532749310e+02 -3.610427145398818e+02 -2.204096978757593e+02 + 4 3.276144969444981e+02 -5.317464615934442e+01 2.105167263568906e+02 -2.453292146035672e+02 + 5 4.326678230142647e+02 1.644712393868885e+02 7.446572544955464e+01 3.931990745407668e+02 + ME 1.483209837516259e-07 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.199291946379054e+02 2.536541992327828e+02 -3.336793713310643e+02 -2.565449022922253e+01 + 3 5.882329265931313e+02 -4.073421144314938e+02 3.584669694021168e+02 2.271383049739999e+02 + 4 1.482351543087514e+02 -7.006196037154501e+01 1.303822701627548e+02 8.089889309615467e+00 + 5 3.436027244602121e+02 2.237498755702561e+02 -1.551698682338073e+02 -2.095737040543929e+02 + ME 2.838703884486094e-06 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.828966757595687e+02 5.914771327429723e+01 -6.614934231762892e+01 
2.686193248468568e+02 + 3 6.901623261275527e+02 3.466820540113055e+02 9.413404853472807e+01 -5.892998988110877e+02 + 4 4.105963527227361e+02 -3.014014625423418e+02 -4.993418597615529e+01 2.743229853975957e+02 + 5 1.163446453901425e+02 -1.044283047432609e+02 2.194947975905614e+01 4.635758856663511e+01 + ME 4.172992284154855e-06 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.360810731792063e+02 1.642782223699453e+02 -1.678975564323074e+02 -2.360824670484281e+01 + 3 5.160979991978127e+02 2.441030829231631e+01 -3.180769822102765e+02 -4.056948545558138e+02 + 4 5.308615635413039e+02 -1.515741961800045e+02 2.722138494694173e+02 4.298126136526764e+02 + 5 2.169593640816770e+02 -3.711433448225709e+01 2.137606891731667e+02 -5.095123920196887e-01 + ME 1.428859626748661e-07 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.616469781392049e+02 1.350377972296386e+02 2.133665597436084e+02 6.854666783966793e+01 + 3 5.923326784759448e+02 -2.698941073591661e+02 1.106177983815398e+02 -5.155374627362274e+02 + 4 4.172276769301888e+02 2.330765525795236e+02 -1.925693830312472e+02 2.875261514248249e+02 + 5 2.287926664546615e+02 -9.822024244999602e+01 -1.314149750939009e+02 1.594646434717345e+02 + ME 1.287134977020958e-07 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.092591846574759e+02 -3.142862365084946e+02 -4.911232501198606e+01 3.976896506463815e+02 + 3 2.735255741033088e+02 -2.718229287466501e+02 2.970718554764163e+01 6.783673265737477e+00 + 4 4.133409721086167e+02 3.274757910496356e+02 
-2.426316191218202e+01 -2.510411609161631e+02 + 5 3.038742691305988e+02 2.586333742055091e+02 4.366830137652649e+01 -1.534321629959558e+02 + ME 1.297196468121015e-05 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.602909302006853e+02 4.125987937663707e+01 1.420578335932218e+02 2.141694717119223e+02 + 3 5.291571012132248e+02 -2.192354150045460e+02 4.669770596370902e+02 -1.177942967416716e+02 + 4 3.718875129217348e+02 1.186191445924093e+02 -2.915140032044291e+02 -1.981146303002550e+02 + 5 3.386644556643550e+02 5.935639103549980e+01 -3.175208900258829e+02 1.017394553300042e+02 + ME 5.230842299336452e-08 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.332952183623519e+01 2.203444947169411e+01 3.375417535460226e+01 -6.125648908761432e+01 + 3 6.172505402697790e+02 -4.734101162373001e+02 -6.693569541003697e+01 -3.903853275771192e+02 + 4 7.073803814485553e+02 4.648032214155373e+02 1.050900954312582e+02 5.227820207909715e+02 + 5 1.020395564454305e+02 -1.342755464993128e+01 -7.190857537582355e+01 -7.114020412623796e+01 + ME 1.327614562653582e-05 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.515175972108462e+02 1.138815578269133e+02 9.437222137628117e+01 4.266063937758993e+02 + 3 4.345680215032115e+02 2.353831600015865e+02 2.161520441325732e+02 -2.944867180675445e+02 + 4 5.497447759881504e+02 -3.929841166309689e+02 -3.541217261697238e+02 -1.496014901550582e+02 + 5 6.416960529779162e+01 4.371939880246914e+01 4.359746066086947e+01 1.748181444670339e+01 + ME 5.788208996244478e-06 + +Event 232 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.827704738940410e+02 2.367350579269709e+02 2.188417272414405e+02 -2.063444849065888e+02 + 3 4.081166106293834e+02 2.895218841081624e+02 -1.566771248798238e+02 2.412229778306114e+02 + 4 2.051848745083037e+02 -3.942539020633504e+01 -1.720622366528163e+02 -1.045995126761917e+02 + 5 5.039280409682721e+02 -4.868315518287982e+02 1.098976342911997e+02 6.972101975216928e+01 + ME 7.961613388414726e-09 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.943080306229409e+02 -1.702895836853463e+02 2.339619717705404e+02 5.367001344015300e+01 + 3 2.122083142716506e+02 -1.780325885917701e+02 -1.129102431248863e+02 -2.424959881746904e+01 + 4 3.867932102720350e+02 -3.902291549294988e+01 2.871044371014024e+02 -2.562366909239652e+02 + 5 6.066904448333735e+02 3.873450877700662e+02 -4.081561657470563e+02 2.268162763012812e+02 + ME 1.745973213090231e-08 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.029213520218470e+02 5.481883220516290e+01 1.866500429479581e+02 2.322025034618088e+02 + 3 5.290563168779679e+02 5.680066435642220e+01 3.079339449882755e+02 -4.264398626783236e+02 + 4 5.645687672755981e+02 -1.284160013176555e+02 -5.400643958490008e+02 1.028478095290611e+02 + 5 1.034535638245874e+02 1.679650475607037e+01 4.548040791276728e+01 9.138954968745372e+01 + ME 2.981349591219886e-04 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 3.445049250783110e+02 -2.589734140921205e+01 -1.319246578968199e+02 3.171889905875589e+02 + 3 4.823315859421515e+02 1.472115336462560e+02 3.873128963179109e+02 2.469033080070407e+02 + 4 3.973986256682689e+01 3.274912805667769e+01 1.780785140860165e+01 -1.377068322803123e+01 + 5 6.334236264127101e+02 -1.540633202937217e+02 -2.731960898296927e+02 -5.503216153665684e+02 + ME 7.627384534210604e-06 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.946542617367739e+02 -3.810657402206150e+02 2.660800071441579e+02 -5.162701567758322e+02 + 3 3.916436838147844e+02 3.070607591843182e+02 -1.609888059458169e+02 1.821567171457177e+02 + 4 9.227402093021708e+01 -2.267558677586709e+01 -8.487016008637363e+01 2.823771644296984e+01 + 5 3.214280335182251e+02 9.668056781216420e+01 -2.022104111196719e+01 3.058757231871448e+02 + ME 1.300010341104758e-06 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.545864517205333e+02 -1.684333917230649e+02 1.738982858125821e+02 7.876445990268432e+01 + 3 2.329362064168683e+01 2.196496253262233e+01 7.299389178887915e+00 2.617651839807976e+00 + 4 6.044417478472913e+02 9.016712624747304e+01 2.020796472547224e+02 -5.624798059045031e+02 + 5 6.176781797904888e+02 5.630130294296941e+01 -3.832773222461925e+02 4.810976941620108e+02 + ME 3.741857901018684e-05 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.734827785011451e+02 6.504306401774852e+01 3.141570859505670e+02 -3.482250230919594e+02 + 3 3.980459969418916e+02 -3.957723037213678e+02 
-3.888755659178172e+01 -1.710722744534840e+01 + 4 4.526639311890891e+02 2.293268680998931e+02 -3.168457877546506e+02 2.278652429599335e+02 + 5 1.758072933678743e+02 1.014023716037262e+02 4.157625839586528e+01 1.374670075773743e+02 + ME 6.638789183821311e-08 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.971046063012477e+02 -8.428839093137762e+01 -3.781742721300474e+02 4.543276327929836e+02 + 3 2.887724106080997e+02 1.067916659556329e+02 -1.547148155349657e+01 -2.678538378843455e+02 + 4 3.710345357389203e+02 1.508541977394770e+02 3.380306102744051e+02 -2.539575265449119e+01 + 5 2.430884473517323e+02 -1.733574727637321e+02 5.561514340913884e+01 -1.610780422541469e+02 + ME 2.374910232515949e-08 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.944633714894942e+02 1.680489118241840e+02 -4.922931942743070e+01 -2.367376490809594e+02 + 3 6.312361218089728e+02 -4.392109669761413e+02 -1.588390894238460e+02 4.246444532021097e+02 + 4 1.228701097365928e+02 3.824564875459875e+01 -3.798906962116549e+01 1.104136078902436e+02 + 5 4.514303969649403e+02 2.329164063973583e+02 2.460574784724422e+02 -2.983204120113940e+02 + ME 2.334754739636239e-06 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.490816146825686e+02 -6.780857619464055e+01 -9.970859965235596e+01 8.766709751202950e+01 + 3 6.219622579644403e+02 5.689750001884105e+02 -2.464007541664535e+02 4.889956890429650e+01 + 4 4.738547977759255e+02 -4.341772833431141e+02 1.011389610234908e+02 -1.606218122396370e+02 + 5 2.551013295770660e+02 
-6.698914065065584e+01 2.449703927953186e+02 2.405514582331103e+01 + ME 3.091784713549796e-08 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.607824489443906e+02 5.914237010894078e+02 9.905219388542328e+01 -2.775610086253786e+02 + 3 3.990062854722138e+02 -2.311648369680887e+02 -1.390196856386275e+02 2.940108178295281e+02 + 4 1.863529685930984e+02 -1.278615554594403e+02 1.331766098585692e+02 2.535433149446559e+01 + 5 2.538582969902974e+02 -2.323973086618787e+02 -9.320911810536504e+01 -4.180414069861513e+01 + ME 4.896247561165527e-08 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.196686647491888e+02 1.122774737725374e+01 -1.164726479644376e+02 2.507287952825762e+01 + 3 1.133151911771732e+02 -7.807701867250280e+01 -5.396068428009529e+01 6.190764297529687e+01 + 4 6.621421970941052e+02 -2.484088449740625e+02 1.240463476686637e+02 -6.011138316004096e+02 + 5 6.048739469795333e+02 3.152581162693116e+02 4.638698457586910e+01 5.141333090968553e+02 + ME 1.891164937385966e-06 + +Event 244 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.280935105172994e+02 1.583139798127287e+02 -1.973572348382067e+02 2.088831187445198e+02 + 3 5.103001436903666e+02 -4.402359718002467e+02 -2.577957536370686e+02 1.182688421340017e+01 + 4 3.470582738822527e+02 1.452007069270574e+02 1.749319995190819e+02 -2.622308062037477e+02 + 5 3.145480719100811e+02 1.367212850604606e+02 2.802209889561934e+02 4.152080324582771e+01 + ME 2.191738542446196e-08 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.396911839925324e+02 3.289933370610810e+02 -7.114763674475310e+01 -4.573290322403688e+01 + 3 4.384857551822396e+02 -2.791071807415779e+02 3.066124832896743e+02 1.426804970805971e+02 + 4 4.176347759399600e+02 -3.519942713896021e+02 -2.140287152627982e+02 -6.863343227815285e+01 + 5 3.041882848852677e+02 3.021081150700990e+02 -2.143613128212307e+01 -2.831416157840738e+01 + ME 1.438570644151102e-05 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.608229391410545e+02 -1.458217158240457e+02 -2.068702814137655e+02 -6.300253521961616e+01 + 3 3.337430353743154e+02 2.371946018027211e+02 1.095161066265306e+02 2.076760865527770e+02 + 4 5.261187567554393e+02 -3.789028184447150e+02 3.437750393676882e+02 -1.226879076856591e+02 + 5 3.793152687291911e+02 2.875299324660397e+02 -2.464208645804532e+02 -2.198564364750174e+01 + ME 7.007360085870299e-09 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.364693619192627e+02 -4.606913761890455e+01 3.470682395859301e+00 -2.319123714881567e+02 + 3 6.707586181725082e+02 6.224312082991470e+02 1.264643842609763e+02 2.156461784995245e+02 + 4 8.441103857058101e+01 -7.843393409710342e+01 -2.748410014225534e+01 1.476365991161495e+01 + 5 5.083609813376478e+02 -4.979281365831390e+02 -1.024509665145806e+02 1.502533077017021e+00 + ME 9.301161506198163e-05 + +Event 248 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.738776112665326e+01 
-3.800357331789279e+01 1.082310776995783e+01 -5.458662018236982e+01 + 3 2.731577122423594e+02 -1.023650088339821e+02 -1.963134715457747e+02 -1.599923798608693e+02 + 4 5.780931297050771e+02 4.359659916762919e+02 -3.151044766581966e+02 2.117415630051930e+02 + 5 5.813613969259109e+02 -2.955974095244171e+02 5.005948404340136e+02 2.837437038046062e+00 + ME 1.129628060059410e-06 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.771449535203825e+02 7.193062052119420e+01 7.263463532627497e+01 5.680197997926654e+02 + 3 1.483275966380963e+02 1.459664561817996e+02 -2.113630427238556e+00 -2.627550495380916e+01 + 4 3.000364175298768e+02 -9.368804156371920e+01 2.167482964027108e+02 -1.851069386020958e+02 + 5 4.744910323116439e+02 -1.242090351392746e+02 -2.872693013017472e+02 -3.566373562367605e+02 + ME 6.089662569520889e-07 + +Event 250 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.070748515323194e+02 2.522952262611133e+02 -2.176867244366134e+02 -3.822100389398939e+02 + 3 1.532396868211022e+02 -1.290354637987217e+02 4.944135710293927e+00 8.250943110551943e+01 + 4 5.298941936163072e+02 -2.854713268542403e+02 -6.677262350798102e+00 4.463736015266297e+02 + 5 3.097912680302719e+02 1.622115643918489e+02 2.194198510771176e+02 -1.466729936922552e+02 + ME 5.253313376485789e-06 + +Event 251 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.182836311039667e+02 -9.488016677547935e+01 -3.342630543518728e+02 2.328672593937015e+02 + 3 5.257152316762974e+02 -2.134880180342400e+02 4.309672861440874e+02 2.122888815879800e+02 + 4 
9.387018854625116e+01 6.019218689726186e+01 1.252636339050904e+01 -7.093379416359140e+01 + 5 4.621309486734849e+02 2.481759979124574e+02 -1.092305951827236e+02 -3.742223468180902e+02 + ME 1.777686127392766e-05 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 9.365447326755803e+01 4.277947245944421e+00 6.388710156528008e+01 -6.834689300891627e+01 + 3 1.951509969640048e+02 7.461354319095608e+01 1.411336517620811e+01 -1.797708088426464e+02 + 4 6.319138287033419e+02 4.463349366878397e+02 7.119278093780922e+01 4.416240472853802e+02 + 5 5.792807010650952e+02 -5.252264271247402e+02 -1.491932476792975e+02 -1.935063454338177e+02 + ME 2.079993547874975e-05 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.901525762082525e+01 -4.735464381405097e+00 -3.620220682176467e+01 4.636567944481126e+01 + 3 5.682395652067073e+02 -2.449465566088505e+02 7.457080728476194e+01 5.072835327237361e+02 + 4 4.471391262730152e+02 4.097222030514868e+02 5.483830234859549e+01 -1.704519731958408e+02 + 5 4.256060508994522e+02 -1.600401820612312e+02 -9.320690281159271e+01 -3.831972389727065e+02 + ME 2.531867470497024e-05 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.548613132274681e+02 1.031167602621186e+02 3.787623878828553e+01 1.091528772998381e+02 + 3 6.185434438850463e+02 -1.082622339079235e+02 -7.722295375814298e+01 -6.040793789687802e+02 + 4 3.466331755597932e+02 4.347459990183822e+01 2.783088740441841e+02 2.020116040981309e+02 + 5 3.799620673276920e+02 -3.832912625603345e+01 -2.389621590743266e+02 
2.929148975708113e+02 + ME 8.925840823464039e-07 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.081680100869346e+02 5.338690546525503e+02 2.877400741676966e+02 -4.536309838913623e+01 + 3 2.634732913131461e+02 -1.846835573335170e+02 1.840484333905927e+02 3.789898484772504e+01 + 4 4.775197661603478e+02 -2.126456306214765e+02 -4.259270823234806e+02 -3.732403240319362e+01 + 5 1.508389324395720e+02 -1.365398666975568e+02 -4.586142523480856e+01 4.478814594460502e+01 + ME 3.772045237625828e-08 + From 22c4ed0202798f270ab9ab1ac09a2c1a4bbc7e6a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 19:18:37 +0100 Subject: [PATCH 94/96] [susy2] in CODEGEN, add reference test file for susy_gg_gogo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ./CODEGEN/generateAndCompare.sh susy_gg_gogo -c 'import model MSSM_SLHA2; generate g g > go go' HRDCOD=1 make -j CUDACPP_RUNTEST_DUMPEVENTS=1 ./runTest.exe cp ../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt ../../../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/ NB: this process builds only in HRDCOD=1... for HRDCOD=0 the build fails as follows ccache g++ -O3 -std=c++17 -I. -I../../src -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -fPIC -c CPPProcess.cc -o CPPProcess.o CPPProcess.cc: In member function ‘virtual void mg5amcCpu::CPPProcess::initProc(const string&)’: CPPProcess.cc:554:38: error: ‘class mg5amcCpu::Parameters_MSSM_SLHA2’ has no member named ‘mdl_bsmIndepParam’; did you mean ‘nBsmIndepParam’? 
554 | memcpy( bsmIndepParam, m_pars->mdl_bsmIndepParam, Parameters_MSSM_SLHA2::nBsmIndepParam * sizeof( double ) ); | ^~~~~~~~~~~~~~~~~ | nBsmIndepParam --- .../dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt | 3584 +++++++++++++++++ 1 file changed, 3584 insertions(+) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt new file mode 100644 index 0000000000..1b7b4027a1 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gg_gogo.txt @@ -0,0 +1,3584 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.849331413473453e+02 -3.138365726669761e+02 -3.490842674916367e+02 + 3 7.500000000000002e+02 -5.849331413473453e+02 3.138365726669762e+02 3.490842674916365e+02 + ME 7.797479428222504e+00 + +Event 1 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.810632950825981e+01 -7.201507372976420e+02 -2.038840274050557e+02 + 3 7.499999999999998e+02 -4.810632950825976e+01 7.201507372976420e+02 2.038840274050555e+02 + ME 7.084428682664780e+00 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.648646621266247e+02 -9.844173672211548e+01 -3.328125681616955e+02 + 3 7.500000000000001e+02 6.648646621266247e+02 9.844173672211554e+01 
3.328125681616955e+02 + ME 7.690151496253704e+00 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.840443703260545e+02 2.880181894591821e+02 -6.315570585677353e+02 + 3 7.500000000000000e+02 -2.840443703260545e+02 -2.880181894591820e+02 6.315570585677352e+02 + ME 1.150017927624436e+01 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.068110730975250e+02 -7.417834499166065e+02 2.913259503670260e+01 + 3 7.500000000000009e+02 -1.068110730975249e+02 7.417834499166063e+02 -2.913259503670238e+01 + ME 6.707503737924870e+00 + +Event 5 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.488894260747408e+02 3.183002578419756e+01 2.552404693662126e+01 + 3 7.500000000000002e+02 -7.488894260747409e+02 -3.183002578419794e+01 -2.552404693662112e+01 + ME 6.708345362936475e+00 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 1.342046371107456e+02 6.997714234797988e+02 2.341133705259677e+02 + 3 7.500000000000003e+02 -1.342046371107461e+02 -6.997714234797993e+02 -2.341133705259674e+02 + ME 6.909107059676345e+00 + +Event 7 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.437674050373497e+02 -9.311360962031361e+01 
-2.529630224920386e+01 + 3 7.500000000000002e+02 -7.437674050373497e+02 9.311360962031345e+01 2.529630224920369e+01 + ME 6.735308973179285e+00 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.195549862822020e+02 4.141972083174201e+02 3.478552699778414e+02 + 3 7.500000000000002e+02 -5.195549862822021e+02 -4.141972083174201e+02 -3.478552699778413e+02 + ME 7.295305201905381e+00 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -5.542195584407183e+02 -3.922814044239861e+02 -3.185215546629774e+02 + 3 7.499999999999999e+02 5.542195584407183e+02 3.922814044239861e+02 3.185215546629775e+02 + ME 7.602474650757721e+00 + +Event 10 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.763918745729663e+01 4.519082493535349e+02 -5.983040976226634e+02 + 3 7.500000000000000e+02 1.763918745729668e+01 -4.519082493535349e+02 5.983040976226634e+02 + ME 1.085263851277656e+01 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.439620635476698e+02 -5.926551785630559e+02 3.895412056149584e+02 + 3 7.500000000000001e+02 -2.439620635476698e+02 5.926551785630560e+02 -3.895412056149584e+02 + ME 7.507616203886550e+00 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 7.500000000000003e+02 2.246322345811119e+02 -1.588382746342976e+02 -6.977182538090113e+02 + 3 7.500000000000003e+02 -2.246322345811118e+02 1.588382746342976e+02 6.977182538090113e+02 + ME 1.281508082685293e+01 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.323144312359151e+02 1.592517083468645e+02 -7.208548984888162e+02 + 3 7.500000000000003e+02 -1.323144312359151e+02 -1.592517083468643e+02 7.208548984888162e+02 + ME 1.318280949876704e+01 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -7.006663333078419e+02 -2.674479229498733e+02 -6.188527994805768e+00 + 3 7.500000000000000e+02 7.006663333078420e+02 2.674479229498733e+02 6.188527994805669e+00 + ME 6.721838142681638e+00 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.958624135651977e+02 -4.159237215561344e+02 3.789827498187641e+02 + 3 7.499999999999994e+02 4.958624135651974e+02 4.159237215561346e+02 -3.789827498187632e+02 + ME 7.449642261773644e+00 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.469644380816109e+02 -1.127460289287337e+02 6.991401142001761e+02 + 3 7.499999999999997e+02 2.469644380816110e+02 1.127460289287336e+02 -6.991401142001760e+02 + ME 1.035095560753067e+01 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.330123448568297e+02 -1.270424905118157e+02 -9.514782126800218e+01 + 3 7.500000000000002e+02 7.330123448568297e+02 1.270424905118155e+02 9.514782126800223e+01 + ME 6.820508278271790e+00 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.632088213454307e+02 -7.202207510227291e+02 -1.309387277748307e+02 + 3 7.499999999999995e+02 1.632088213454307e+02 7.202207510227292e+02 1.309387277748307e+02 + ME 6.887964242238246e+00 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -6.437046519440437e+02 3.615612227659719e+02 1.319765254034722e+02 + 3 7.499999999999993e+02 6.437046519440439e+02 -3.615612227659715e+02 -1.319765254034726e+02 + ME 6.743958570143084e+00 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 3.414032790372056e+02 6.619737876650544e+01 6.645010971451343e+02 + 3 7.499999999999999e+02 -3.414032790372056e+02 -6.619737876650578e+01 -6.645010971451345e+02 + ME 1.009276680327207e+01 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.873372127826304e+02 1.330214527069504e+02 -2.690220233436987e+02 + 3 7.500000000000001e+02 6.873372127826304e+02 -1.330214527069503e+02 2.690220233436988e+02 
+ ME 7.341687749091427e+00 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.102099953078523e+02 -1.032640414901363e+02 4.236511529094595e+02 + 3 7.500000000000001e+02 -6.102099953078524e+02 1.032640414901364e+02 -4.236511529094595e+02 + ME 7.715954336780805e+00 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.344895803220364e+02 4.175443017228516e+02 3.201212967391421e+02 + 3 7.500000000000003e+02 5.344895803220364e+02 -4.175443017228515e+02 -3.201212967391419e+02 + ME 7.177125103351771e+00 + +Event 24 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000028e+02 1.418425161536915e+02 -5.777728680811459e+02 -4.566828369015786e+02 + 3 7.499999999999993e+02 -1.418425161536913e+02 5.777728680811460e+02 4.566828369015801e+02 + ME 8.748460978805161e+00 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.716447327703473e+02 -2.655667876457525e+02 -5.948566834991209e+02 + 3 7.500000000000003e+02 3.716447327703468e+02 2.655667876457529e+02 5.948566834991209e+02 + ME 1.078832563764083e+01 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.982684047175284e+02 2.520499043566507e+02 1.067336904966799e+02 
+ 3 7.499999999999986e+02 -6.982684047175262e+02 -2.520499043566515e+02 -1.067336904966817e+02 + ME 6.723921540420133e+00 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.841422120170263e+02 -2.875466445556268e+02 3.722832300072828e+02 + 3 7.499999999999998e+02 5.841422120170263e+02 2.875466445556265e+02 -3.722832300072828e+02 + ME 7.414372500856212e+00 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.475651347914466e+02 9.856189468229891e-02 -5.125157689733829e+02 + 3 7.500000000000001e+02 5.475651347914466e+02 -9.856189468231821e-02 5.125157689733829e+02 + ME 9.446623345558731e+00 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.650224466071159e+02 8.783714288524543e+01 -5.818408377565888e+02 + 3 7.500000000000001e+02 4.650224466071160e+02 -8.783714288524527e+01 5.818408377565887e+02 + ME 1.055105048565986e+01 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.782526290173227e+02 -4.370648563926671e+02 -5.828381521098888e+02 + 3 7.499999999999997e+02 -1.782526290173224e+02 4.370648563926672e+02 5.828381521098888e+02 + ME 1.056891270802689e+01 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 7.500000000000000e+02 5.241445908528015e+01 -6.984630617313879e+02 -2.681456243827462e+02 + 3 7.500000000000000e+02 -5.241445908528016e+01 6.984630617313879e+02 2.681456243827462e+02 + ME 7.337621069530262e+00 + +Event 32 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 4.502669185499855e+02 -4.340701284735809e+02 -4.139357747603256e+02 + 3 7.500000000000048e+02 -4.502669185499816e+02 4.340701284735767e+02 4.139357747603273e+02 + ME 8.315221954301407e+00 + +Event 33 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 3.654307823639502e+02 -1.558862065836153e+02 -6.361287871947331e+02 + 3 7.499999999999998e+02 -3.654307823639500e+02 1.558862065836156e+02 6.361287871947331e+02 + ME 1.159229014558217e+01 + +Event 34 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.482796593496792e+02 5.744589614092289e+02 4.588137359318844e+02 + 3 7.499999999999999e+02 1.482796593496794e+02 -5.744589614092289e+02 -4.588137359318844e+02 + ME 7.966939850618117e+00 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.948173619449916e+02 -4.238736069615458e+02 1.703627636844682e+02 + 3 7.500000000000002e+02 5.948173619449916e+02 4.238736069615458e+02 -1.703627636844681e+02 + ME 6.789370067084201e+00 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.826165317041465e+02 4.475925182298392e+02 -5.313086048919125e+02 + 3 7.500000000000002e+02 -2.826165317041465e+02 -4.475925182298392e+02 5.313086048919126e+02 + ME 9.719098323354352e+00 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.291270169087664e+02 -1.025561575994626e+02 5.215427446678408e+02 + 3 7.500000000000002e+02 -5.291270169087662e+02 1.025561575994624e+02 -5.215427446678408e+02 + ME 8.514164241866695e+00 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.783993127041845e+02 -6.682289316640183e+02 2.900754731338715e+02 + 3 7.499999999999999e+02 1.783993127041845e+02 6.682289316640183e+02 -2.900754731338714e+02 + ME 7.067680260537180e+00 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999968e+02 7.035220480385162e+02 -2.585746933707225e+02 2.637908019484448e+01 + 3 7.499999999999977e+02 -7.035220480385176e+02 2.585746933707186e+02 -2.637908019484159e+01 + ME 6.708133168849063e+00 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.045990218130194e+00 -7.391865839862929e+02 -1.268940501329283e+02 + 3 7.499999999999998e+02 -1.045990218130205e+00 7.391865839862932e+02 1.268940501329283e+02 
+ ME 6.879461741963940e+00 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.516393832142920e+02 -5.982487717061383e+02 2.490531433069630e+01 + 3 7.500000000000000e+02 -4.516393832142919e+02 5.982487717061383e+02 -2.490531433069630e+01 + ME 6.708503862411032e+00 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -3.404063782711548e+02 -2.677733538887420e+02 -6.123078707476251e+02 + 3 7.499999999999997e+02 3.404063782711548e+02 2.677733538887418e+02 6.123078707476250e+02 + ME 1.111975281033072e+01 + +Event 43 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.431989925840303e+02 -4.375135005185253e+02 4.179193580543111e+02 + 3 7.500000000000001e+02 -4.431989925840303e+02 4.375135005185250e+02 -4.179193580543110e+02 + ME 7.678612274608614e+00 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.639601113951325e+02 7.019963091374882e+02 -5.122650643294619e+00 + 3 7.499999999999998e+02 -2.639601113951325e+02 -7.019963091374882e+02 5.122650643294828e+00 + ME 6.721205907630888e+00 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.618223353367828e+02 2.920485830812342e+02 
5.137389957014393e+02 + 3 7.499999999999999e+02 4.618223353367827e+02 -2.920485830812343e+02 -5.137389957014392e+02 + ME 8.438965419463550e+00 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -7.051112468437969e+02 2.555740963349754e+02 1.041966542073612e-01 + 3 7.499999999999887e+02 7.051112468437901e+02 -2.555740963349691e+02 -1.041966542099173e-01 + ME 6.718286483864190e+00 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.814786796723017e+02 -1.264994422665163e+02 -6.332234266526608e+02 + 3 7.499999999999999e+02 3.814786796723017e+02 1.264994422665163e+02 6.332234266526607e+02 + ME 1.153369028370769e+01 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.223718392138900e+02 -9.075826002414991e+01 5.304626281216111e+02 + 3 7.499999999999998e+02 5.223718392138896e+02 9.075826002414973e+01 -5.304626281216111e+02 + ME 8.602555394557349e+00 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.512062929704629e+02 5.497568647487626e+02 -4.440301656798211e+02 + 3 7.500000000000018e+02 -2.512062929704655e+02 -5.497568647487626e+02 4.440301656798209e+02 + ME 8.611721346816333e+00 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.081599294091976e+02 -4.985692845883419e+02 3.838412959671586e+02 + 3 7.500000000000003e+02 4.081599294091976e+02 4.985692845883421e+02 -3.838412959671589e+02 + ME 7.475951923389490e+00 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000015e+02 6.013537967828095e+02 1.484120711712792e+02 4.229036157631543e+02 + 3 7.500000000000006e+02 -6.013537967828099e+02 -1.484120711712781e+02 -4.229036157631560e+02 + ME 7.711029108307889e+00 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.788389248314356e+01 -6.412729991041272e+02 3.870836489834254e+02 + 3 7.500000000000001e+02 -3.788389248314355e+01 6.412729991041272e+02 -3.870836489834253e+02 + ME 7.493857307167884e+00 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.721956772389075e+02 -5.973666994060347e+02 2.590818486220348e+02 + 3 7.500000000000000e+02 3.721956772389076e+02 5.973666994060347e+02 -2.590818486220348e+02 + ME 6.973074230015372e+00 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.279511131417045e+02 4.100901816942154e+02 1.855365759053465e+00 + 3 7.500000000000002e+02 -6.279511131417045e+02 -4.100901816942154e+02 -1.855365759053460e+00 + ME 6.717375460640644e+00 + +Event 55 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 2.906696961973026e+02 5.783283149374521e+02 -3.788766129680944e+02 + 3 7.500000000000005e+02 -2.906696961973020e+02 -5.783283149374525e+02 3.788766129680951e+02 + ME 8.016392208079195e+00 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.052921399041344e+02 -6.285366974145303e+02 -3.539728199972656e+02 + 3 7.500000000000001e+02 -2.052921399041342e+02 6.285366974145303e+02 3.539728199972656e+02 + ME 7.831358184398179e+00 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.566747271383028e+02 -2.172482942051992e+01 -5.021367144037026e+02 + 3 7.500000000000002e+02 -5.566747271383028e+02 2.172482942051981e+01 5.021367144037026e+02 + ME 9.304505100652069e+00 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.111752699385216e+02 -4.460916785360812e+02 -5.925725893888097e+02 + 3 7.500000000000000e+02 1.111752699385215e+02 4.460916785360812e+02 5.925725893888094e+02 + ME 1.074604632868053e+01 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 8.715681773052808e+00 -1.696629445817668e+02 -7.305056619404637e+02 + 3 7.499999999999999e+02 -8.715681773052719e+00 1.696629445817669e+02 
7.305056619404637e+02 + ME 1.329638337603891e+01 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 6.928032817602112e+02 2.624225493926035e+02 1.168675247986692e+02 + 3 7.499999999999998e+02 -6.928032817602113e+02 -2.624225493926033e+02 -1.168675247986697e+02 + ME 6.731062824431854e+00 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -3.549971539677355e+02 2.839011090540920e+02 -5.965544241330073e+02 + 3 7.499999999999994e+02 3.549971539677352e+02 -2.839011090540920e+02 5.965544241330074e+02 + ME 1.081992315160001e+01 + +Event 62 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.075091268532093e+02 5.438463972512828e+02 4.149328055225045e+02 + 3 7.499999999999998e+02 3.075091268532093e+02 -5.438463972512828e+02 -4.149328055225045e+02 + ME 7.659537525677337e+00 + +Event 63 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.476352128383518e+02 -1.524963074254458e+02 4.892244371258421e+02 + 3 7.499999999999998e+02 -5.476352128383518e+02 1.524963074254458e+02 -4.892244371258423e+02 + ME 8.215908260963062e+00 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.967977363399760e+02 1.264105709968621e+02 
-6.237563017523095e+02 + 3 7.499999999999999e+02 -3.967977363399760e+02 -1.264105709968621e+02 6.237563017523095e+02 + ME 1.134441176821248e+01 + +Event 65 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.816326009293766e+02 -1.978646582155084e+02 2.423356646874607e+02 + 3 7.499999999999998e+02 -6.816326009293764e+02 1.978646582155085e+02 -2.423356646874606e+02 + ME 6.929028434219458e+00 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.665640119169115e+01 -7.284851156281566e+02 -1.745443913733847e+02 + 3 7.500000000000002e+02 3.665640119169112e+01 7.284851156281566e+02 1.745443913733847e+02 + ME 6.995056396691838e+00 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.153305678246726e+02 1.717790906820774e+02 -1.459250586433670e+02 + 3 7.499999999999997e+02 -7.153305678246727e+02 -1.717790906820775e+02 1.459250586433670e+02 + ME 6.921520761871023e+00 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.487016905468892e+02 8.269239097845502e+01 -5.952552069277764e+02 + 3 7.500000000000000e+02 4.487016905468892e+02 -8.269239097845502e+01 5.952552069277764e+02 + ME 1.079572967956066e+01 + +Event 69 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999918e+02 4.919500575481741e+02 -2.881927178399906e+02 -4.872679942930399e+02 + 3 7.500000000000083e+02 -4.919500575481738e+02 2.881927178399889e+02 4.872679942930392e+02 + ME 9.110979447599515e+00 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.773996828232634e+02 1.950566034399255e+02 -6.180796072185035e+02 + 3 7.500000000000000e+02 3.773996828232635e+02 -1.950566034399255e+02 6.180796072185035e+02 + ME 1.123237262674778e+01 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 5.046979384687712e+02 -3.405008111295472e+02 4.379945074147537e+02 + 3 7.500000000000006e+02 -5.046979384687718e+02 3.405008111295484e+02 -4.379945074147531e+02 + ME 7.813713548154583e+00 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.079545016377724e+02 4.542001484952560e+02 5.594257326540076e+02 + 3 7.499999999999999e+02 2.079545016377724e+02 -4.542001484952560e+02 -5.594257326540076e+02 + ME 8.906605510775098e+00 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.692745262677147e+02 2.933889353680813e+02 1.688032614990298e+02 + 3 7.499999999999999e+02 6.692745262677147e+02 -2.933889353680814e+02 -1.688032614990298e+02 + ME 6.787157059259481e+00 + +Event 74 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.336394009948917e+02 -5.214090606974456e+00 -6.119065967645178e+02 + 3 7.500000000000000e+02 -4.336394009948916e+02 5.214090606974732e+00 6.119065967645176e+02 + ME 1.111197479992554e+01 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.886320510875908e+01 -1.836844277890388e+02 -7.247724843508736e+02 + 3 7.499999999999997e+02 5.886320510875880e+01 1.836844277890388e+02 7.247724843508737e+02 + ME 1.323254919923648e+01 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.688146793241962e+02 4.830467602600369e+02 3.307243925875947e+02 + 3 7.499999999999999e+02 -4.688146793241962e+02 -4.830467602600369e+02 -3.307243925875947e+02 + ME 7.220270925734456e+00 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -6.339445789702437e+02 -2.102547003631318e+02 -3.411850403658548e+02 + 3 7.499999999999999e+02 6.339445789702437e+02 2.102547003631318e+02 3.411850403658547e+02 + ME 7.744348984620499e+00 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.669825029292291e+02 -6.850785338298775e+02 1.479450763464258e+02 + 3 7.499999999999999e+02 -2.669825029292290e+02 6.850785338298771e+02 
-1.479450763464256e+02 + ME 6.760593630776225e+00 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -3.533967556727068e+02 -1.417706364842367e+02 -6.461515454681359e+02 + 3 7.499999999999997e+02 3.533967556727068e+02 1.417706364842367e+02 6.461515454681356e+02 + ME 1.179577225123097e+01 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.262373668943079e+02 -6.163135238556600e+02 -3.129452628857666e+01 + 3 7.500000000000006e+02 4.262373668943078e+02 6.163135238556595e+02 3.129452628857673e+01 + ME 6.740380400369218e+00 + +Event 81 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -7.465617073786625e+02 6.074723773897961e+01 -3.814957644509608e+01 + 3 7.499999999999997e+02 7.465617073786623e+02 -6.074723773897957e+01 3.814957644509607e+01 + ME 6.746675982105315e+00 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 7.475659550619686e+02 -4.211883263705306e+00 6.022792435124191e+01 + 3 7.499999999999999e+02 -7.475659550619688e+02 4.211883263705413e+00 -6.022792435124152e+01 + ME 6.706112288324684e+00 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.388305538878824e+02 
2.707043584000301e+02 -2.848063794273738e+02 + 3 7.499999999999997e+02 -6.388305538878824e+02 -2.707043584000302e+02 2.848063794273737e+02 + ME 7.418064969701935e+00 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -1.782135480622982e+02 -6.945359840166780e+02 -2.199083859088392e+02 + 3 7.499999999999990e+02 1.782135480622982e+02 6.945359840166778e+02 2.199083859088392e+02 + ME 7.139764782597618e+00 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.627525077696845e+02 1.216078070177454e+02 -6.450753129512397e+02 + 3 7.499999999999998e+02 3.627525077696845e+02 -1.216078070177455e+02 6.450753129512397e+02 + ME 1.177384726051603e+01 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.904296033356202e+02 8.239172209485156e+01 4.550873406668074e+02 + 3 7.500000000000000e+02 5.904296033356203e+02 -8.239172209485153e+01 -4.550873406668076e+02 + ME 7.938505758624281e+00 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.743433600636469e+01 -2.654341936409746e+02 6.982100680837725e+02 + 3 7.499999999999998e+02 -6.743433600636463e+01 2.654341936409747e+02 -6.982100680837727e+02 + ME 1.034681898488622e+01 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.123842692071389e+02 6.372819831147647e+02 -2.424618368923953e+02 + 3 7.499999999999999e+02 3.123842692071390e+02 -6.372819831147647e+02 2.424618368923954e+02 + ME 7.226137648130027e+00 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 2.445149472352194e+02 2.427088730721040e+02 -6.661867932574361e+02 + 3 7.499999999999997e+02 -2.445149472352192e+02 -2.427088730721042e+02 6.661867932574365e+02 + ME 1.220403413853937e+01 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -6.884814144916903e+01 7.440438304666905e+02 -6.448807458320992e+01 + 3 7.499999999999998e+02 6.884814144916899e+01 -7.440438304666906e+02 6.448807458320998e+01 + ME 6.775925612327670e+00 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999941e+02 4.141760802881196e+02 -5.972800904748360e+02 1.849720736747363e+02 + 3 7.499999999999908e+02 -4.141760802881158e+02 5.972800904748445e+02 -1.849720736747465e+02 + ME 6.811684740012724e+00 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.924404784338581e+01 6.957303155217185e+02 2.773431678857729e+02 + 3 7.499999999999999e+02 -3.924404784338569e+01 -6.957303155217185e+02 -2.773431678857732e+02 + ME 7.026676837371092e+00 + +Event 93 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.290083413418520e+02 -2.957602135933497e+02 5.394235248061854e+02 + 3 7.500000000000001e+02 4.290083413418521e+02 2.957602135933496e+02 -5.394235248061852e+02 + ME 8.693913124971472e+00 + +Event 94 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.430046970424431e+02 5.127268066666063e+02 6.897188351284326e+01 + 3 7.500000000000002e+02 5.430046970424431e+02 -5.127268066666062e+02 -6.897188351284419e+01 + ME 6.707622073154013e+00 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.670174710353041e+02 -4.266096910567016e+02 -5.938091746221822e+02 + 3 7.500000000000002e+02 -1.670174710353047e+02 4.266096910567017e+02 5.938091746221819e+02 + ME 1.076890279995460e+01 + +Event 96 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.630781801178598e+02 4.564206587749570e+02 -5.338258642940684e+02 + 3 7.499999999999998e+02 2.630781801178598e+02 -4.564206587749571e+02 5.338258642940683e+02 + ME 9.757103324657161e+00 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.518870680096824e+02 -5.345219674188427e+02 -3.911032571000130e+02 + 3 7.500000000000002e+02 -3.518870680096824e+02 
5.345219674188428e+02 3.911032571000131e+02 + ME 8.115274589640631e+00 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.752074620001102e+02 -5.701512527722713e+02 -1.077284410990112e+02 + 3 7.499999999999998e+02 -4.752074620001101e+02 5.701512527722716e+02 1.077284410990111e+02 + ME 6.842255369335994e+00 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.725046135512980e+02 -2.442507613156967e+02 6.033919774737652e+02 + 3 7.499999999999999e+02 3.725046135512980e+02 2.442507613156966e+02 -6.033919774737648e+02 + ME 9.407655642567324e+00 + +Event 100 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.744099165829508e+02 -2.914668115726682e+02 1.507261164040270e+02 + 3 7.499999999999997e+02 6.744099165829508e+02 2.914668115726678e+02 -1.507261164040270e+02 + ME 6.763814288573121e+00 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.607743247254628e+01 -6.958012347235277e+02 2.794676330495263e+02 + 3 7.500000000000000e+02 1.607743247254616e+01 6.958012347235277e+02 -2.794676330495263e+02 + ME 7.033306362386790e+00 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 
6.752700687038075e+02 -3.061796867633289e+02 -1.129793508844210e+02 + 3 7.499999999999999e+02 -6.752700687038073e+02 3.061796867633290e+02 1.129793508844209e+02 + ME 6.851951413315867e+00 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000036e+02 2.709405901562357e+02 -3.521101236710506e+02 6.042430449861624e+02 + 3 7.500000000000044e+02 -2.709405901562365e+02 3.521101236710579e+02 -6.042430449861611e+02 + ME 9.417633948461591e+00 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.896850660222863e+02 -3.933704689676244e+02 2.450126467235562e+02 + 3 7.499999999999992e+02 5.896850660222860e+02 3.933704689676243e+02 -2.450126467235561e+02 + ME 6.935752902964187e+00 + +Event 105 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 -3.232008513548737e+02 2.751892248186203e+02 -6.183139172194096e+02 + 3 7.499999999999998e+02 3.232008513548740e+02 -2.751892248186206e+02 6.183139172194097e+02 + ME 1.123697301201238e+01 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -6.293500647395244e+02 3.467904123852745e+02 -2.148369285993609e+02 + 3 7.499999999999995e+02 6.293500647395244e+02 -3.467904123852746e+02 2.148369285993609e+02 + ME 7.121728370787759e+00 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -3.164803773315047e+02 6.485956071491383e+02 -2.041173906137068e+02 + 3 7.500000000000002e+02 3.164803773315048e+02 -6.485956071491387e+02 2.041173906137065e+02 + ME 7.085200277349600e+00 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.270898770881453e+02 1.543847712987804e+02 -7.228509566520221e+02 + 3 7.500000000000002e+02 -1.270898770881454e+02 -1.543847712987804e+02 7.228509566520221e+02 + ME 1.320872825767113e+01 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.270964825883011e+02 4.495607724123360e+02 2.873402337840359e+02 + 3 7.499999999999999e+02 -5.270964825883012e+02 -4.495607724123360e+02 -2.873402337840358e+02 + ME 7.058611875105907e+00 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.153664733993946e+02 -2.099256997662553e+02 -8.174355824015288e+01 + 3 7.499999999999999e+02 7.153664733993946e+02 2.099256997662553e+02 8.174355824015289e+01 + ME 6.799579285241573e+00 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -4.856864011705253e+02 4.693412062195837e+02 -3.260790576875008e+02 + 3 7.499999999999994e+02 4.856864011705258e+02 -4.693412062195837e+02 3.260790576875021e+02 + ME 7.648096360227617e+00 + +Event 112 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -5.064700140846775e+02 5.272494748911357e+02 1.673203994140850e+02 + 3 7.500000000000001e+02 5.064700140846774e+02 -5.272494748911355e+02 -1.673203994140850e+02 + ME 6.785082575638268e+00 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.589223488778573e+02 3.321278479133653e+02 3.738942371383856e+02 + 3 7.500000000000001e+02 -5.589223488778573e+02 -3.321278479133653e+02 -3.738942371383856e+02 + ME 7.422747981819099e+00 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.977003556727854e+01 -1.428011991695804e+02 7.345956446068554e+02 + 3 7.499999999999998e+02 4.977003556727847e+01 1.428011991695803e+02 -7.345956446068554e+02 + ME 1.030038653000271e+01 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.329003126925222e+02 7.085629881653695e+02 7.871426903610268e+01 + 3 7.500000000000002e+02 -2.329003126925222e+02 -7.085629881653696e+02 -7.871426903610255e+01 + ME 6.710297352567093e+00 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.524145310251806e+01 9.339299182317880e+01 7.440063488879913e+02 + 3 7.499999999999999e+02 
1.524145310251813e+01 -9.339299182317893e+01 -7.440063488879913e+02 + ME 1.018802770472233e+01 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 8.625602426427402e+01 6.794638862165430e+02 -3.055956897694018e+02 + 3 7.500000000000000e+02 -8.625602426427714e+01 -6.794638862165400e+02 3.055956897693996e+02 + ME 7.528182745247396e+00 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -4.994800131986638e+02 -5.547627166697752e+02 7.251237555226911e+01 + 3 7.500000000000005e+02 4.994800131986640e+02 5.547627166697750e+02 -7.251237555226926e+01 + ME 6.708472792334977e+00 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.696453710916101e+02 -6.412256459582578e+02 -3.500715912961300e+02 + 3 7.499999999999998e+02 1.696453710916101e+02 6.412256459582578e+02 3.500715912961301e+02 + ME 7.804259582706406e+00 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.909680497736688e+02 -4.472542030924529e+02 -1.149801807392986e+02 + 3 7.500000000000002e+02 -5.909680497736687e+02 4.472542030924529e+02 1.149801807392986e+02 + ME 6.855743737564565e+00 + +Event 121 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000001e+02 4.403248055966551e+02 -1.878335448594352e+02 5.773496557562031e+02 + 3 7.499999999999999e+02 -4.403248055966551e+02 1.878335448594352e+02 -5.773496557562031e+02 + ME 9.106354766567003e+00 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000026e+02 -7.911542100677293e+01 4.343577103010812e+02 6.062789206800165e+02 + 3 7.499999999999986e+02 7.911542100677258e+01 -4.343577103010829e+02 -6.062789206800168e+02 + ME 9.441518002649387e+00 + +Event 123 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.897831726795230e+02 -1.483823971281492e+02 5.482655451352789e+02 + 3 7.499999999999998e+02 -4.897831726795229e+02 1.483823971281491e+02 -5.482655451352789e+02 + ME 8.786490013031440e+00 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.072168527361443e+02 3.676168647522193e+02 -6.200153358521670e+02 + 3 7.500000000000002e+02 2.072168527361445e+02 -3.676168647522196e+02 6.200153358521671e+02 + ME 1.127044202513034e+01 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.485400265882382e+02 -5.747265047667000e+02 -4.583942683103899e+02 + 3 7.500000000000001e+02 1.485400265882382e+02 5.747265047667000e+02 4.583942683103898e+02 + ME 8.767533515460121e+00 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 1.787754338344337e+02 4.314693606084865e+02 5.868334815886133e+02 + 3 7.500000000000003e+02 -1.787754338344337e+02 -4.314693606084866e+02 -5.868334815886132e+02 + ME 9.214897758988224e+00 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -1.824698245714143e+02 -6.992613640416625e+02 -2.005948849783361e+02 + 3 7.500000000000008e+02 1.824698245714143e+02 6.992613640416617e+02 2.005948849783362e+02 + ME 7.073658623464972e+00 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.808356150498172e+02 5.501654866182587e+02 -3.387951765355025e+02 + 3 7.500000000000001e+02 -3.808356150498172e+02 -5.501654866182587e+02 3.387951765355026e+02 + ME 7.728659604696491e+00 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999991e+02 5.909177663532490e+02 -7.106409991592293e+00 4.618069860289239e+02 + 3 7.499999999999995e+02 -5.909177663532489e+02 7.106409991592403e+00 -4.618069860289236e+02 + ME 7.990103037172059e+00 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.434408032869524e+02 1.129979432354521e+02 3.683957077518653e+02 + 3 7.499999999999998e+02 6.434408032869527e+02 -1.129979432354514e+02 -3.683957077518652e+02 + ME 
7.394434134350836e+00 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 6.746515082539581e+02 1.884449229635105e+02 2.680183826157036e+02 + 3 7.500000000000008e+02 -6.746515082539582e+02 -1.884449229635102e+02 -2.680183826157018e+02 + ME 6.998558277413073e+00 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.558298868453927e+02 9.921998690776338e+01 -6.980017644638850e+02 + 3 7.499999999999999e+02 2.558298868453927e+02 -9.921998690776337e+01 6.980017644638850e+02 + ME 1.282016204575895e+01 + +Event 133 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 3.135802854217641e+02 -4.216148118485027e+02 5.351713324018350e+02 + 3 7.499999999999995e+02 -3.135802854217633e+02 4.216148118485037e+02 -5.351713324018345e+02 + ME 8.650246219466727e+00 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.181187851375607e+02 2.100869264433576e+02 7.102192872171315e+02 + 3 7.500000000000000e+02 1.181187851375605e+02 -2.100869264433577e+02 -7.102192872171314e+02 + ME 1.038297320722016e+01 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.827401847882261e+02 -7.210896591418211e+02 
-9.558100408709542e+01 + 3 7.500000000000001e+02 1.827401847882261e+02 7.210896591418210e+02 9.558100408709542e+01 + ME 6.821222819532189e+00 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999955e+02 1.347097060534654e+02 3.165403007658664e+02 6.664499479226020e+02 + 3 7.499999999999990e+02 -1.347097060534688e+02 -3.165403007658659e+02 -6.664499479226026e+02 + ME 1.011155786167805e+01 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.709304925774791e+02 -4.143354602318728e+02 -2.546850977966301e+02 + 3 7.500000000000000e+02 -5.709304925774790e+02 4.143354602318728e+02 2.546850977966302e+02 + ME 7.277372583497503e+00 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.230024938116915e+02 -6.078146322618763e+01 7.134952708195409e+02 + 3 7.500000000000001e+02 2.230024938116915e+02 6.078146322618760e+01 -7.134952708195409e+02 + ME 1.038551596876548e+01 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -4.938095139105243e+02 -4.125095274968338e+02 3.853414767395158e+02 + 3 7.500000000000000e+02 4.938095139105243e+02 4.125095274968338e+02 -3.853414767395157e+02 + ME 7.484201616719025e+00 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 2.150639064959607e+02 7.154069121581585e+02 -6.663682284611133e+01 + 3 7.499999999999993e+02 -2.150639064959593e+02 -7.154069121581598e+02 6.663682284611147e+01 + ME 6.778673613026928e+00 + +Event 141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.001994340787286e+02 -3.935615219448572e+02 2.176464283557658e+02 + 3 7.500000000000001e+02 -6.001994340787286e+02 3.935615219448572e+02 -2.176464283557658e+02 + ME 6.872437476965886e+00 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.402076114496817e+02 6.461448067635170e+02 -1.710428887859415e+02 + 3 7.500000000000002e+02 -3.402076114496816e+02 -6.461448067635171e+02 1.710428887859415e+02 + ME 6.985363442268832e+00 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999960e+02 -2.483174680463629e+02 4.871846387911277e+02 -5.133123442788001e+02 + 3 7.499999999999978e+02 2.483174680463647e+02 -4.871846387911270e+02 5.133123442788014e+02 + ME 9.457774006071118e+00 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.342206172984323e+01 7.464718320769291e+02 -6.878379852869928e+01 + 3 7.500000000000002e+02 2.342206172984324e+01 -7.464718320769290e+02 6.878379852869917e+01 + ME 6.781474939365426e+00 + +Event 145 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999964e+02 6.139969952391264e+02 -3.976063262667788e+02 -1.655804914534161e+02 + 3 7.499999999999991e+02 -6.139969952391292e+02 3.976063262667798e+02 1.655804914534176e+02 + ME 6.970638391385144e+00 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -7.489741806317041e+02 -1.872912227594817e+01 3.445136754673254e+01 + 3 7.500000000000000e+02 7.489741806317041e+02 1.872912227594822e+01 -3.445136754673256e+01 + ME 6.706520820011854e+00 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.413828145683407e+02 3.157741422579704e+02 -6.654303801495521e+02 + 3 7.500000000000000e+02 1.413828145683408e+02 -3.157741422579705e+02 6.654303801495525e+02 + ME 1.218870627538836e+01 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.671561067841420e+02 2.284674470436643e+02 2.553533920314686e+02 + 3 7.500000000000002e+02 6.671561067841420e+02 -2.284674470436643e+02 -2.553533920314689e+02 + ME 6.962855031224713e+00 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 3.495458719419115e+02 -1.546896579009742e+02 -6.452819485673282e+02 + 3 7.499999999999998e+02 -3.495458719419115e+02 
1.546896579009742e+02 6.452819485673284e+02 + ME 1.177805579456881e+01 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -5.603068721757455e+01 7.476532040007055e+02 1.937133545739349e+01 + 3 7.499999999999998e+02 5.603068721757454e+01 -7.476532040007073e+02 -1.937133545739277e+01 + ME 6.710106329345492e+00 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -2.349239539523203e+01 2.915649777816018e+02 -6.906069584718988e+02 + 3 7.499999999999984e+02 2.349239539523064e+01 -2.915649777816013e+02 6.906069584718991e+02 + ME 1.268438575333683e+01 + +Event 152 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 1.325883219266814e+02 7.097564553439871e+02 2.028943345345158e+02 + 3 7.499999999999999e+02 -1.325883219266814e+02 -7.097564553439881e+02 -2.028943345345148e+02 + ME 6.843093428225990e+00 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.538006601590899e+02 -6.984195271964691e+02 -1.014661960136019e+02 + 3 7.499999999999998e+02 2.538006601590899e+02 6.984195271964689e+02 1.014661960136019e+02 + ME 6.831171350097628e+00 + +Event 154 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 
-1.323172710781251e+02 -6.860984228486730e+02 2.725088878165298e+02 + 3 7.500000000000003e+02 1.323172710781252e+02 6.860984228486727e+02 -2.725088878165298e+02 + ME 7.011901545030085e+00 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.452704712456864e+02 -4.996070210616754e+02 1.247915770129769e+02 + 3 7.500000000000001e+02 5.452704712456864e+02 4.996070210616755e+02 -1.247915770129769e+02 + ME 6.737486695665050e+00 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -1.756428583566509e+02 5.398397609473054e+02 -4.901251052625871e+02 + 3 7.499999999999972e+02 1.756428583566513e+02 -5.398397609473053e+02 4.901251052625872e+02 + ME 9.147262116685511e+00 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -7.343271291233364e+02 5.984001257480936e+01 1.402955463602692e+02 + 3 7.500000000000000e+02 7.343271291233360e+02 -5.984001257480933e+01 -1.402955463602694e+02 + ME 6.752233034997561e+00 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.956876772329626e+02 -3.688982317556030e+02 -2.675262338545274e+02 + 3 7.500000000000003e+02 -5.956876772329629e+02 3.688982317556030e+02 2.675262338545274e+02 + ME 7.334757758634730e+00 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.610896376803973e+02 -1.089359794187622e+02 5.814028710041216e+02 + 3 7.500000000000002e+02 -4.610896376803973e+02 1.089359794187622e+02 -5.814028710041216e+02 + ME 9.152535455729591e+00 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 7.080441809468051e+02 2.091039913840824e+02 -1.320945063756017e+02 + 3 7.500000000000006e+02 -7.080441809468048e+02 -2.091039913840823e+02 1.320945063756016e+02 + ME 6.890436513684910e+00 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.069769124239759e+02 3.104285773922232e+02 3.125909885497297e+02 + 3 7.499999999999999e+02 -6.069769124239759e+02 -3.104285773922233e+02 -3.125909885497299e+02 + ME 7.147953040125858e+00 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.828417389515272e+01 3.828517565470377e+02 6.412969702909132e+02 + 3 7.500000000000001e+02 -6.828417389515273e+01 -3.828517565470376e+02 -6.412969702909131e+02 + ME 9.847520394996939e+00 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.807789436749607e+02 6.954089716815359e+02 -8.339478017354233e+00 + 3 7.500000000000001e+02 -2.807789436749607e+02 -6.954089716815359e+02 8.339478017354306e+00 + ME 6.723152148094316e+00 + +Event 164 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.142343236884305e+02 3.091601587257571e+02 -6.067757296338901e+02 + 3 7.500000000000000e+02 -3.142343236884303e+02 -3.091601587257571e+02 6.067757296338900e+02 + ME 1.101314600029087e+01 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.065756695626631e+02 4.857807362514072e+02 2.643826153403073e+02 + 3 7.500000000000000e+02 5.065756695626630e+02 -4.857807362514070e+02 -2.643826153403071e+02 + ME 6.988019936149384e+00 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.582369138095781e+02 -7.322863531814778e+02 3.489664815125224e+01 + 3 7.500000000000001e+02 -1.582369138095780e+02 7.322863531814778e+02 -3.489664815125219e+01 + ME 6.706452454586795e+00 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.506018848986033e+02 -1.448154652770772e+02 -5.817958596813918e+02 + 3 7.499999999999999e+02 -4.506018848986032e+02 1.448154652770771e+02 5.817958596813918e+02 + ME 1.055024618386568e+01 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.383838657580176e+02 4.266457451013772e+02 -6.011017500263642e+02 + 3 7.499999999999999e+02 
-1.383838657580177e+02 -4.266457451013772e+02 6.011017500263641e+02 + ME 1.090526315852438e+01 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 4.787059935025895e+02 5.010202352445186e+02 -2.869133940063202e+02 + 3 7.499999999999999e+02 -4.787059935025894e+02 -5.010202352445185e+02 2.869133940063202e+02 + ME 7.428720891758926e+00 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.735853015668400e+02 3.083708463739302e+01 6.976389875699663e+02 + 3 7.500000000000000e+02 2.735853015668399e+02 -3.083708463739308e+01 -6.976389875699663e+02 + ME 1.034417906633690e+01 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 6.071409269801585e+01 6.637658831421409e+02 -3.438439345464608e+02 + 3 7.500000000000001e+02 -6.071409269801596e+01 -6.637658831421408e+02 3.438439345464608e+02 + ME 7.762013234593232e+00 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -7.459627102375579e+02 -5.960011581968052e+01 4.987445367439310e+01 + 3 7.500000000000001e+02 7.459627102375570e+02 5.960011581968127e+01 -4.987445367439175e+01 + ME 6.705407121449587e+00 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000000e+02 -5.227626126251467e+02 -8.611217040357174e+01 5.308520932893799e+02 + 3 7.500000000000002e+02 5.227626126251467e+02 8.611217040357153e+01 -5.308520932893800e+02 + ME 8.606473232789599e+00 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 7.427899788159028e+02 8.606171690349171e+01 -5.793468955896282e+01 + 3 7.500000000000015e+02 -7.427899788159044e+02 -8.606171690349211e+01 5.793468955896348e+01 + ME 6.767885826780614e+00 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 1.880595469134288e+02 1.952277576903688e+02 6.992994562002083e+02 + 3 7.500000000000001e+02 -1.880595469134288e+02 -1.952277576903685e+02 -6.992994562002083e+02 + ME 1.035164381182860e+01 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.956039433245556e+02 6.710800379532824e+02 -1.573845333938319e+02 + 3 7.500000000000003e+02 2.956039433245558e+02 -6.710800379532825e+02 1.573845333938319e+02 + ME 6.949432589737475e+00 + +Event 177 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.644810965455426e+02 -6.588907599528918e+02 2.416872276699125e+02 + 3 7.499999999999998e+02 -2.644810965455426e+02 6.588907599528918e+02 -2.416872276699126e+02 + ME 6.927417362964576e+00 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.046483864607851e+02 3.100401775516633e+02 3.174514404662580e+02 + 3 7.500000000000000e+02 6.046483864607851e+02 -3.100401775516633e+02 -3.174514404662580e+02 + ME 7.166644623608519e+00 + +Event 179 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000080e+02 -7.554400969691378e+01 -5.342383817568482e+02 -5.209438108440585e+02 + 3 7.500000000000013e+02 7.554400969691604e+01 5.342383817568525e+02 5.209438108440615e+02 + ME 9.566380317553536e+00 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.109280079817108e+02 7.886792704189078e+01 5.433529430710355e+02 + 3 7.500000000000002e+02 -5.109280079817111e+02 -7.886792704189065e+01 -5.433529430710350e+02 + ME 8.734762457149046e+00 + +Event 181 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -2.142616447442585e+02 -7.154366257261994e+02 -6.886495597177277e+01 + 3 7.500000000000006e+02 2.142616447442585e+02 7.154366257261994e+02 6.886495597177328e+01 + ME 6.781581925104637e+00 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 6.851592186609105e+02 1.986891791084640e+02 2.314723637697936e+02 + 3 7.500000000000001e+02 -6.851592186609104e+02 -1.986891791084641e+02 -2.314723637697936e+02 + ME 
6.902939430785030e+00 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000267e+02 -4.900054502464741e+02 -4.951952606757387e+02 -2.778062499891736e+02 + 3 7.500000000000108e+02 4.900054502465225e+02 4.951952606757201e+02 2.778062499891378e+02 + ME 7.383450581232615e+00 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -4.678382693716266e+02 5.778877958512076e+02 -9.835166047197177e+01 + 3 7.500000000000007e+02 4.678382693716272e+02 -5.778877958512081e+02 9.835166047197225e+01 + ME 6.825850391481555e+00 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -1.061006737744817e+02 -6.192100761952036e+02 4.096602599263592e+02 + 3 7.499999999999998e+02 1.061006737744817e+02 6.192100761952034e+02 -4.096602599263590e+02 + ME 7.626493792801067e+00 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.455169979042824e+02 -6.058703167657474e+02 3.676174138996159e+02 + 3 7.500000000000002e+02 -2.455169979042824e+02 6.058703167657475e+02 -3.676174138996160e+02 + ME 7.390488395763054e+00 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000018e+02 -7.262021511245942e+01 -7.131602101967054e+02 
-2.205194298677781e+02 + 3 7.500000000000007e+02 7.262021511245977e+01 7.131602101967062e+02 2.205194298677796e+02 + ME 7.141971333297112e+00 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000033e+02 -7.363140077196462e+02 -6.013110168472254e+01 1.293287773313056e+02 + 3 7.499999999999995e+02 7.363140077196484e+02 6.013110168472303e+01 -1.293287773313050e+02 + ME 6.741501395267847e+00 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 4.206549846649756e+02 6.188260461572106e+02 -5.102654675699417e+01 + 3 7.500000000000010e+02 -4.206549846649751e+02 -6.188260461572106e+02 5.102654675699346e+01 + ME 6.759961646390165e+00 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.604864850049305e+00 4.279484687014963e+01 -7.487570945048692e+02 + 3 7.500000000000007e+02 5.604864850049609e+00 -4.279484687014988e+01 7.487570945048687e+02 + ME 1.340660148965061e+01 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.197160362814987e+01 2.941727699376930e+02 -6.891590512999375e+02 + 3 7.500000000000001e+02 -3.197160362814988e+01 -2.941727699376931e+02 6.891590512999372e+02 + ME 1.265709206053847e+01 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.806169100853667e+02 -1.212419534166893e+02 -4.590036954694658e+02 + 3 7.499999999999998e+02 -5.806169100853667e+02 1.212419534166893e+02 4.590036954694659e+02 + ME 8.774358814688139e+00 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.456318764132128e+02 1.898225735585409e+02 3.310994876570575e+02 + 3 7.500000000000000e+02 6.456318764132128e+02 -1.898225735585409e+02 -3.310994876570575e+02 + ME 7.221842532392818e+00 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.338176939226098e+02 -1.425870083131823e+02 -3.747586902673791e+02 + 3 7.499999999999999e+02 -6.338176939226098e+02 1.425870083131823e+02 3.747586902673791e+02 + ME 7.984312229802413e+00 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.241223574217317e+01 6.578529508306934e+02 -3.576740098786882e+02 + 3 7.500000000000000e+02 -4.241223574217290e+01 -6.578529508306934e+02 3.576740098786883e+02 + ME 7.857526923336235e+00 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.866306133186302e+02 -4.882418460209895e+01 6.913458544798877e+02 + 3 7.499999999999998e+02 -2.866306133186303e+02 4.882418460209907e+01 -6.913458544798879e+02 + ME 1.031037294706617e+01 + +Event 197 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.280026621922505e+02 -1.673514311633879e+02 -5.927117508906939e+02 + 3 7.499999999999998e+02 4.280026621922505e+02 1.673514311633878e+02 5.927117508906939e+02 + ME 1.074861460395256e+01 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.361491583779541e+02 5.918825836234865e+02 4.400436499668658e+02 + 3 7.500000000000001e+02 1.361491583779541e+02 -5.918825836234865e+02 -4.400436499668658e+02 + ME 7.828192287371367e+00 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -6.634973495764098e+02 2.221316746677197e+01 -3.489668211967610e+02 + 3 7.499999999999995e+02 6.634973495764096e+02 -2.221316746677151e+01 3.489668211967610e+02 + ME 7.796674982756190e+00 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -8.393568923823412e+01 7.316416839490793e+02 1.419691740493621e+02 + 3 7.499999999999998e+02 8.393568923823399e+01 -7.316416839490793e+02 -1.419691740493622e+02 + ME 6.754000208058978e+00 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.430360425977634e+02 -7.001150406129024e+02 -1.152059542992958e+02 + 3 7.499999999999981e+02 -2.430360425977634e+02 
7.001150406129024e+02 1.152059542992965e+02 + ME 6.856175071986240e+00 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -6.592948659363004e+02 -1.261773212911234e+02 3.345288677256898e+02 + 3 7.500000000000001e+02 6.592948659363008e+02 1.261773212911236e+02 -3.345288677256897e+02 + ME 7.236357094853454e+00 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 6.830488054426559e+02 -6.443348473770835e+01 -3.029730275584201e+02 + 3 7.500000000000018e+02 -6.830488054426557e+02 6.443348473770854e+01 3.029730275584199e+02 + ME 7.513667072616929e+00 + +Event 204 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.581562204420950e+02 4.859410156234014e+02 5.489516729825758e+02 + 3 7.500000000000001e+02 -1.581562204420950e+02 -4.859410156234014e+02 -5.489516729825760e+02 + ME 8.793771731391521e+00 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000024e+02 4.522293593956092e+02 5.926970499435945e+02 8.184627962711241e+01 + 3 7.500000000000011e+02 -4.522293593956088e+02 -5.926970499435940e+02 -8.184627962710897e+01 + ME 6.711381225624119e+00 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 
5.347787837401271e+02 -3.496041176300521e+02 3.927958927961737e+02 + 3 7.500000000000006e+02 -5.347787837401269e+02 3.496041176300521e+02 -3.927958927961737e+02 + ME 7.526088975835178e+00 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.012154446331053e+02 6.385938504619652e+02 -2.529172790986055e+02 + 3 7.500000000000001e+02 -3.012154446331053e+02 -6.385938504619652e+02 2.529172790986053e+02 + ME 7.269762007343325e+00 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 7.361476225156938e+02 -7.555700020645618e+01 1.219746595990007e+02 + 3 7.499999999999989e+02 -7.361476225156939e+02 7.555700020645691e+01 -1.219746595989992e+02 + ME 6.735117861201180e+00 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 2.682577368640644e+02 -6.981157757232997e+02 5.632184566245714e+01 + 3 7.499999999999997e+02 -2.682577368640644e+02 6.981157757232997e+02 -5.632184566245755e+01 + ME 6.705708727813955e+00 + +Event 210 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -9.656834699970371e+01 -3.286793908889068e+01 7.430304522277630e+02 + 3 7.500000000000003e+02 9.656834699970366e+01 3.286793908889081e+01 -7.430304522277632e+02 + ME 1.020234763210611e+01 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.319690574526541e+02 -6.682701567195197e+02 2.492094581323018e+02 + 3 7.500000000000003e+02 2.319690574526541e+02 6.682701567195195e+02 -2.492094581323018e+02 + ME 6.946534911140684e+00 + +Event 212 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -6.134259033134992e+02 3.540686678370769e+02 2.466658460348640e+02 + 3 7.500000000000002e+02 6.134259033134995e+02 -3.540686678370769e+02 -2.466658460348640e+02 + ME 6.939965008692581e+00 + +Event 213 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.285016171336246e+02 5.172862825223078e+02 -1.249037333364969e+02 + 3 7.499999999999999e+02 5.285016171336246e+02 -5.172862825223078e+02 1.249037333364970e+02 + ME 6.875362440442372e+00 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 3.200786657490619e+02 -3.467878641431868e+02 -5.829132225428646e+02 + 3 7.500000000000001e+02 -3.200786657490619e+02 3.467878641431865e+02 5.829132225428646e+02 + ME 1.057025942160350e+01 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999967e+02 -6.774094555834016e+02 -2.472477212256578e+02 2.061188827713710e+02 + 3 7.499999999999969e+02 6.774094555834032e+02 2.472477212256506e+02 -2.061188827713709e+02 + ME 6.849232602328065e+00 + +Event 216 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000011e+02 -2.582427834329911e+02 2.233184861234906e+02 6.677870308416393e+02 + 3 7.499999999999992e+02 2.582427834329910e+02 -2.233184861234906e+02 -6.677870308416392e+02 + ME 1.012423773477464e+01 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 4.526164805502723e+02 -1.109167036758216e+02 -5.876527940714436e+02 + 3 7.500000000000000e+02 -4.526164805502723e+02 1.109167036758217e+02 5.876527940714437e+02 + ME 1.065589657994070e+01 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000015e+02 -2.774671855265908e+02 -6.964182224530517e+02 2.266319463987561e+01 + 3 7.500000000000018e+02 2.774671855265906e+02 6.964182224530506e+02 -2.266319463987825e+01 + ME 6.709113035666018e+00 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.962071114522207e+02 4.057112512865967e+02 -2.059986913387148e+02 + 3 7.500000000000000e+02 5.962071114522206e+02 -4.057112512865969e+02 2.059986913387148e+02 + ME 7.091456992226568e+00 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.732003838284645e+02 1.672407953016319e+02 4.538412017058156e+02 + 3 7.500000000000002e+02 
-5.732003838284645e+02 -1.672407953016319e+02 -4.538412017058156e+02 + ME 7.929096281033113e+00 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.373739041416519e+01 6.118806829877552e+02 -4.330572318790715e+02 + 3 7.500000000000000e+02 -2.373739041416523e+01 -6.118806829877552e+02 4.330572318790714e+02 + ME 8.499051226021304e+00 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -2.926156270671632e+02 -4.951619530302008e+02 4.813426379071445e+02 + 3 7.500000000000002e+02 2.926156270671630e+02 4.951619530302008e+02 -4.813426379071445e+02 + ME 8.148453022506523e+00 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.745329870696013e+02 3.653875131398772e+01 -7.284937584337815e+02 + 3 7.499999999999999e+02 1.745329870696013e+02 -3.653875131398769e+01 7.284937584337815e+02 + ME 1.327528553966853e+01 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -5.087597823083424e+02 -4.620767648884736e+02 -3.002474766851023e+02 + 3 7.500000000000006e+02 5.087597823083426e+02 4.620767648884734e+02 3.002474766851025e+02 + ME 7.498776698497275e+00 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000017e+02 -2.331337537494541e+02 -7.096534890457165e+02 -6.738381369388374e+01 + 3 7.500000000000008e+02 2.331337537494539e+02 7.096534890457161e+02 6.738381369388425e+01 + ME 6.779641953872367e+00 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.653645699768243e+02 -5.593089569178837e+02 4.233853277009537e+02 + 3 7.499999999999998e+02 -2.653645699768244e+02 5.593089569178836e+02 -4.233853277009537e+02 + ME 7.714201016213133e+00 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -1.520233915006850e+02 6.293027014655149e+02 -3.786383477209281e+02 + 3 7.500000000000001e+02 1.520233915006851e+02 -6.293027014655149e+02 3.786383477209281e+02 + ME 8.014519504570874e+00 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999986e+02 -3.734089345660486e+00 1.222238163786131e+02 7.399644554210729e+02 + 3 7.499999999999989e+02 3.734089345661556e+00 -1.222238163786135e+02 -7.399644554210732e+02 + ME 1.024314993836091e+01 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 -6.305938355568198e+02 2.251166185777011e+02 -3.378963193020604e+02 + 3 7.499999999999991e+02 6.305938355568204e+02 -2.251166185777007e+02 3.378963193020603e+02 + ME 7.722804234534269e+00 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 1.722967433849823e+02 6.011245480280783e+02 -4.140810427645368e+02 + 3 7.500000000000153e+02 -1.722967433849770e+02 -6.011245480280730e+02 4.140810427645383e+02 + ME 8.316560675818183e+00 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.986233395749251e+02 5.694313413368092e+02 -3.860984959365697e+02 + 3 7.499999999999997e+02 2.986233395749250e+02 -5.694313413368093e+02 3.860984959365698e+02 + ME 8.074132298236410e+00 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.772789865710538e+02 -7.124844710497247e+02 -1.530948706957047e+02 + 3 7.500000000000009e+02 -1.772789865710538e+02 7.124844710497254e+02 1.530948706957047e+02 + ME 6.938750215565554e+00 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.448110289802655e+02 -6.646149940143565e+02 -4.350016120507384e+01 + 3 7.500000000000001e+02 -3.448110289802655e+02 6.646149940143565e+02 4.350016120507379e+01 + ME 6.751963555710427e+00 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 5.161943612943153e+01 -7.278723602156033e+02 1.733126107582698e+02 + 3 7.500000000000006e+02 -5.161943612943217e+01 7.278723602156028e+02 -1.733126107582698e+02 + ME 
6.793644274535075e+00 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 2.147886171723107e+02 -3.086282984760173e+02 6.489332965051223e+02 + 3 7.499999999999998e+02 -2.147886171723106e+02 3.086282984760172e+02 -6.489332965051223e+02 + ME 9.931792380079020e+00 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.457991029234601e+02 -2.210062269052574e+02 6.732303087882513e+02 + 3 7.500000000000000e+02 2.457991029234601e+02 2.210062269052574e+02 -6.732303087882513e+02 + ME 1.017390815139612e+01 + +Event 237 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.076996593814685e+02 -4.026323435046960e+02 -5.977023067186026e+02 + 3 7.500000000000000e+02 -2.076996593814686e+02 4.026323435046960e+02 5.977023067186025e+02 + ME 1.084136902896207e+01 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.969032777715726e+02 -4.416010667956443e+02 -1.058063078956159e+02 + 3 7.499999999999999e+02 -5.969032777715722e+02 4.416010667956443e+02 1.058063078956158e+02 + ME 6.838798119820627e+00 + +Event 239 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.188858424502662e+02 7.168183137005395e+02 
-2.757703969522984e+01 + 3 7.500000000000000e+02 -2.188858424502661e+02 -7.168183137005394e+02 2.757703969522973e+01 + ME 6.737189454789739e+00 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 5.906386180567438e+02 -1.043852672012176e+01 -4.621007033320043e+02 + 3 7.499999999999997e+02 -5.906386180567437e+02 1.043852672012184e+01 4.621007033320043e+02 + ME 8.809319077944339e+00 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 1.310164484338845e+02 -7.028250186529380e+02 -2.266532227771516e+02 + 3 7.500000000000000e+02 -1.310164484338844e+02 7.028250186529378e+02 2.266532227771515e+02 + ME 7.164525068989178e+00 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.060596880801855e+02 6.058870732428816e+02 3.189487761529025e+02 + 3 7.499999999999998e+02 3.060596880801855e+02 -6.058870732428816e+02 -3.189487761529026e+02 + ME 7.172503641762868e+00 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.455428611464694e+02 6.177778614271272e-02 -3.818041521596315e+02 + 3 7.499999999999999e+02 -6.455428611464695e+02 -6.177778614265914e-02 3.818041521596315e+02 + ME 8.039569224808659e+00 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.600023964153236e+02 5.665197502294574e+01 -5.896510400382236e+02 + 3 7.499999999999997e+02 -4.600023964153236e+02 -5.665197502294568e+01 5.896510400382238e+02 + ME 1.069235961813599e+01 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.283205132260828e+02 3.195629486835195e+01 -5.313720264775101e+02 + 3 7.499999999999995e+02 5.283205132260829e+02 -3.195629486835195e+01 5.313720264775098e+02 + ME 9.720051438564610e+00 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.245676679674773e+02 -7.153172514000221e+02 1.976340945196907e+01 + 3 7.500000000000003e+02 2.245676679674772e+02 7.153172514000222e+02 -1.976340945196914e+01 + ME 6.709981848584103e+00 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.304482834256935e+02 1.189549860529900e+02 7.037352484843176e+02 + 3 7.499999999999999e+02 2.304482834256935e+02 -1.189549860529900e+02 -7.037352484843176e+02 + ME 1.036827874651352e+01 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 2.537801015626400e+02 -6.571572670245528e+02 -2.573713007459748e+02 + 3 7.499999999999986e+02 -2.537801015626407e+02 6.571572670245536e+02 2.573713007459756e+02 + ME 7.289069236135539e+00 + +Event 249 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.380072251709859e+02 4.007276857381548e+02 5.875967006794294e+02 + 3 7.500000000000009e+02 -2.380072251709867e+02 -4.007276857381549e+02 -5.875967006794293e+02 + ME 9.223703071584543e+00 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 6.786028214456382e-01 -3.391021082103957e+02 6.689613589030948e+02 + 3 7.499999999999984e+02 -6.786028214436315e-01 3.391021082103953e+02 -6.689613589030942e+02 + ME 1.013522498128797e+01 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 8.268187630824573e+01 3.486690416389526e+02 6.588578046382401e+02 + 3 7.499999999999982e+02 -8.268187630824475e+01 -3.486690416389528e+02 -6.588578046382391e+02 + ME 1.003646321352829e+01 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.102134709467127e+02 -2.983471238084358e+02 5.524616746608593e+02 + 3 7.499999999999973e+02 4.102134709467126e+02 2.983471238084354e+02 -5.524616746608588e+02 + ME 8.831237264272179e+00 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -1.361744772971167e+02 -1.872893882889477e+02 -7.133576920221777e+02 + 3 7.499999999999991e+02 1.361744772971159e+02 
1.872893882889485e+02 7.133576920221776e+02 + ME 1.307590140769724e+01 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.137214690822852e+02 -3.619701676967440e+02 2.341443061128602e+02 + 3 7.499999999999999e+02 -6.137214690822851e+02 3.619701676967440e+02 -2.341443061128603e+02 + ME 6.909179964561467e+00 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 2.766263610363187e+02 -4.252208793533441e+02 5.524174690773034e+02 + 3 7.500000000000002e+02 -2.766263610363191e+02 4.252208793533440e+02 -5.524174690773031e+02 + ME 8.830763204900633e+00 + +Event 0 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.886369700926294e+02 -5.166396019380891e+02 -2.383640769242444e+02 + 3 7.499999999999995e+02 -4.886369700926294e+02 5.166396019380888e+02 2.383640769242449e+02 + ME 7.209673891293530e+00 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 3.808436077386334e+02 -5.054156008216496e+02 4.025086544295820e+02 + 3 7.500000000000003e+02 -3.808436077386334e+02 5.054156008216497e+02 -4.025086544295821e+02 + ME 7.582943642504766e+00 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999984e+02 5.183034673675862e+02 
4.361721238657036e+02 -3.218934514357475e+02 + 3 7.499999999999985e+02 -5.183034673675861e+02 -4.361721238657032e+02 3.218934514357483e+02 + ME 7.622626000975899e+00 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -7.346631711305880e+02 1.079089156707178e+02 1.054783906926816e+02 + 3 7.499999999999997e+02 7.346631711305879e+02 -1.079089156707179e+02 -1.054783906926816e+02 + ME 6.723119907144046e+00 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 -3.860135335017675e+02 -5.185727762898845e+02 -3.802312817805505e+02 + 3 7.499999999999998e+02 3.860135335017677e+02 5.185727762898845e+02 3.802312817805510e+02 + ME 8.027078434002204e+00 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.412146924854130e+01 6.250106998537266e+02 4.131553586783979e+02 + 3 7.500000000000000e+02 3.412146924854130e+01 -6.250106998537266e+02 -4.131553586783979e+02 + ME 7.648308355942532e+00 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 5.103283769498013e+02 -1.882131351255153e+02 -5.163726981996672e+02 + 3 7.499999999999993e+02 -5.103283769498015e+02 1.882131351255153e+02 5.163726981996674e+02 + ME 9.500939222569805e+00 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 -5.382967505952900e+02 -1.493237627632342e+02 5.004388296013622e+02 + 3 7.500000000000024e+02 5.382967505952930e+02 1.493237627632328e+02 -5.004388296013629e+02 + ME 8.315457283078029e+00 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -4.598745101308194e+02 -5.621086252728979e+02 -1.872146584158232e+02 + 3 7.499999999999998e+02 4.598745101308190e+02 5.621086252728986e+02 1.872146584158218e+02 + ME 7.031830207565333e+00 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 1.775786374899709e+02 -3.096805779838334e+02 6.595936378762345e+02 + 3 7.499999999999994e+02 -1.775786374899706e+02 3.096805779838338e+02 -6.595936378762350e+02 + ME 1.004395275726744e+01 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -5.131238447356292e+02 -5.270922542475881e+02 1.462110648204190e+02 + 3 7.499999999999956e+02 5.131238447356274e+02 5.270922542475843e+02 -1.462110648204196e+02 + ME 6.758634613463200e+00 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.355823051698720e+02 3.562860272236928e+02 -1.777509498149288e+02 + 3 7.500000000000000e+02 6.355823051698720e+02 -3.562860272236928e+02 1.777509498149290e+02 + ME 7.004109335257140e+00 + +Event 12 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -3.902557542118755e+02 -4.044250918271755e+02 4.966294306674759e+02 + 3 7.499999999999999e+02 3.902557542118753e+02 4.044250918271755e+02 -4.966294306674759e+02 + ME 8.281170236184602e+00 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.388642282518219e+02 -5.842429835301776e+02 4.050975396832450e+02 + 3 7.500000000000001e+02 -2.388642282518219e+02 5.842429835301778e+02 -4.050975396832449e+02 + ME 7.598541567208300e+00 + +Event 14 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 5.937999534112087e+02 4.550788359673929e+02 5.296100814192662e+01 + 3 7.499999999999994e+02 -5.937999534112080e+02 -4.550788359673928e+02 -5.296100814192649e+01 + ME 6.705494936670632e+00 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -1.423179755450549e+02 4.493605018245673e+02 -5.833701511362495e+02 + 3 7.500000000000005e+02 1.423179755450543e+02 -4.493605018245677e+02 5.833701511362492e+02 + ME 1.057846297921757e+01 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.706489889553403e+02 1.222987596040275e+02 -6.886887091979717e+02 + 3 7.500000000000002e+02 2.706489889553404e+02 -1.222987596040272e+02 
6.886887091979717e+02 + ME 1.264818185303662e+01 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.991385516924362e+02 -3.979472289417108e+02 3.936987543693655e+02 + 3 7.499999999999992e+02 4.991385516924362e+02 3.979472289417106e+02 -3.936987543693655e+02 + ME 7.531264417786402e+00 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.716904304441446e+02 6.823352520667955e+02 1.519964269054311e+02 + 3 7.499999999999999e+02 -2.716904304441446e+02 -6.823352520667954e+02 -1.519964269054311e+02 + ME 6.765317828101795e+00 + +Event 19 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.773498418349232e+02 4.394024782830364e+02 -3.762480439535861e+02 + 3 7.500000000000002e+02 4.773498418349230e+02 -4.394024782830365e+02 3.762480439535861e+02 + ME 7.995844958324827e+00 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 2.152723736932615e+01 -4.446684930211732e+02 -6.035780888712939e+02 + 3 7.500000000000002e+02 -2.152723736932629e+01 4.446684930211732e+02 6.035780888712939e+02 + ME 1.095216026484990e+01 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -4.518768158716143e+02 
2.713668944146141e+01 5.979723600333784e+02 + 3 7.500000000000002e+02 4.518768158716146e+02 -2.713668944146124e+01 -5.979723600333788e+02 + ME 9.344233305615944e+00 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -2.605342203226538e+02 -5.585006861164978e+02 4.274329229812182e+02 + 3 7.499999999999998e+02 2.605342203226538e+02 5.585006861164978e+02 -4.274329229812181e+02 + ME 7.741126345308382e+00 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.082800789219980e+02 -4.759337282317115e+02 2.786367666021681e+02 + 3 7.500000000000001e+02 -5.082800789219980e+02 4.759337282317115e+02 -2.786367666021681e+02 + ME 7.030703606155221e+00 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.304443270717347e+02 -6.119700149046336e+02 5.206133065322422e+01 + 3 7.499999999999990e+02 4.304443270717353e+02 6.119700149046336e+02 -5.206133065322406e+01 + ME 6.705458617541164e+00 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 9.357647372272233e+01 -4.354467328122425e+02 6.034315093270853e+02 + 3 7.500000000000001e+02 -9.357647372272231e+01 4.354467328122425e+02 -6.034315093270852e+02 + ME 9.408119040980090e+00 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.406043478727808e+02 3.407510258582016e+02 -5.747846697561972e+02 + 3 7.499999999999999e+02 3.406043478727807e+02 -3.407510258582016e+02 5.747846697561973e+02 + ME 1.042622452210979e+01 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999993e+02 -3.284333987867702e+02 6.303438323961643e+02 -2.393703313309438e+02 + 3 7.500000000000003e+02 3.284333987867702e+02 -6.303438323961647e+02 2.393703313309439e+02 + ME 7.213684277924166e+00 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -1.029133508056655e+02 6.216832636308156e+02 -4.067170539174465e+02 + 3 7.499999999999995e+02 1.029133508056657e+02 -6.216832636308160e+02 4.067170539174464e+02 + ME 8.249778399997188e+00 + +Event 29 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.821390153127890e+02 5.538122935981008e+02 1.527544217783090e+02 + 3 7.500000000000006e+02 4.821390153127887e+02 -5.538122935981006e+02 -1.527544217783091e+02 + ME 6.766224713438733e+00 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 9.489768343899590e+01 -7.281060338646736e+02 1.528268076214603e+02 + 3 7.500000000000000e+02 -9.489768343899605e+01 7.281060338646735e+02 -1.528268076214603e+02 + ME 6.766311698545573e+00 + +Event 31 Batch 1 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 2.588924414604488e+02 -6.567632234918364e+02 -2.532523879912312e+02 + 3 7.500000000000000e+02 -2.588924414604486e+02 6.567632234918364e+02 2.532523879912311e+02 + ME 7.271199389669306e+00 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.671713815281792e+02 4.353863719603355e+02 4.879793885994619e+02 + 3 7.500000000000005e+02 3.671713815281791e+02 -4.353863719603356e+02 -4.879793885994619e+02 + ME 8.205115130486284e+00 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 6.319452257542164e+02 1.146829159774965e+02 -3.872893755699342e+02 + 3 7.499999999999990e+02 -6.319452257542164e+02 -1.146829159774965e+02 3.872893755699342e+02 + ME 8.083837430178352e+00 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.486096093516043e+02 -5.693192358255851e+02 4.202009874536406e+02 + 3 7.499999999999999e+02 -2.486096093516040e+02 5.693192358255851e+02 -4.202009874536406e+02 + ME 7.693360722761708e+00 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -7.016777587248107e+02 2.605400911432719e+02 -4.761495372235760e+01 + 3 7.500000000000007e+02 7.016777587248106e+02 
-2.605400911432719e+02 4.761495372235756e+01 + ME 6.756254647308545e+00 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.992190538187326e+02 -1.415759996521825e+02 -4.282438112373229e+02 + 3 7.499999999999998e+02 5.992190538187326e+02 1.415759996521825e+02 4.282438112373229e+02 + ME 8.451306338470335e+00 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.677951851527304e+02 1.710101924456866e+02 2.954743724361214e+02 + 3 7.499999999999999e+02 -6.677951851527304e+02 -1.710101924456866e+02 -2.954743724361211e+02 + ME 7.086005658083392e+00 + +Event 38 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.262233168639913e+01 -1.242788051966740e+02 7.369757269768426e+02 + 3 7.500000000000003e+02 6.262233168639904e+01 1.242788051966741e+02 -7.369757269768426e+02 + ME 1.027713948384085e+01 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.702492999956164e+02 -1.498752499034611e+01 3.362160762813851e+02 + 3 7.500000000000001e+02 -6.702492999956164e+02 1.498752499034600e+01 -3.362160762813851e+02 + ME 7.243595166725399e+00 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
-2.153528524047672e+02 -7.173641694168139e+02 -3.888183892327463e+01 + 3 7.499999999999999e+02 2.153528524047672e+02 7.173641694168140e+02 3.888183892327463e+01 + ME 6.747380196635997e+00 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999981e+02 -1.577959591347084e+02 1.068869602279798e+02 -7.253796337187790e+02 + 3 7.500000000000070e+02 1.577959591347114e+02 -1.068869602279763e+02 7.253796337187800e+02 + ME 1.323983513171215e+01 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 2.605757710166903e+02 -3.124498327841133e+02 6.300598142654685e+02 + 3 7.499999999999999e+02 -2.605757710166904e+02 3.124498327841131e+02 -6.300598142654685e+02 + ME 9.719526759137498e+00 + +Event 43 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 7.087179265265711e+02 -2.450238073334029e+02 1.349942442185454e+01 + 3 7.500000000000000e+02 -7.087179265265711e+02 2.450238073334030e+02 -1.349942442185463e+01 + ME 6.712170578206123e+00 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000063e+02 -1.936709096449502e+01 -3.211501049305117e+02 6.774861813282137e+02 + 3 7.499999999999998e+02 1.936709096449624e+01 3.211501049305155e+02 -6.774861813282184e+02 + ME 1.021031315358551e+01 + +Event 45 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.123591650584196e+02 -4.263717314995431e+02 -4.589848206317992e+02 + 3 7.499999999999998e+02 4.123591650584196e+02 4.263717314995430e+02 4.589848206317992e+02 + ME 8.774147158945564e+00 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 4.417510221679374e+02 1.507214984231038e+02 -5.870596769721774e+02 + 3 7.500000000000003e+02 -4.417510221679374e+02 -1.507214984231037e+02 5.870596769721774e+02 + ME 1.064511422272398e+01 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000039e+02 3.841729019576128e+02 -2.923622481869822e+02 5.739647177627883e+02 + 3 7.500000000000006e+02 -3.841729019576121e+02 2.923622481869865e+02 -5.739647177627892e+02 + ME 9.068050820770026e+00 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 5.683917324787876e+02 -4.888657012953396e+02 2.100391741506453e+01 + 3 7.500000000000014e+02 -5.683917324787886e+02 4.888657012953391e+02 -2.100391741506501e+01 + ME 6.709598997738912e+00 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -5.514168436731683e+01 -6.826028811438466e+00 7.479390349325276e+02 + 3 7.500000000000003e+02 5.514168436731685e+01 6.826028811438419e+00 -7.479390349325279e+02 + ME 1.012337334002715e+01 + +Event 50 Batch 1 
+ 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.984159499903413e+02 -4.045210751108284e+02 -5.566063425640615e+02 + 3 7.500000000000006e+02 2.984159499903413e+02 4.045210751108285e+02 5.566063425640615e+02 + ME 1.011746246941698e+01 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -7.047061352594474e+02 -2.553121422897804e+02 -2.655132627753902e+01 + 3 7.500000000000001e+02 7.047061352594474e+02 2.553121422897804e+02 2.655132627753895e+01 + ME 6.736336497283380e+00 + +Event 52 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.049072352300009e+02 -2.374327835145615e+02 -6.812772550606136e+02 + 3 7.499999999999995e+02 -2.049072352300012e+02 2.374327835145616e+02 6.812772550606132e+02 + ME 1.250527245599122e+01 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.762846372615920e+02 -3.488577990178417e+01 6.963833581971029e+02 + 3 7.499999999999997e+02 -2.762846372615920e+02 3.488577990178413e+01 -6.963833581971029e+02 + ME 1.033811358513441e+01 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.965754309649124e+02 2.298482753022435e+02 3.921320256107799e+02 + 3 7.499999999999995e+02 
5.965754309649125e+02 -2.298482753022435e+02 -3.921320256107795e+02 + ME 7.522297690129307e+00 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 5.249326727483450e+02 -5.207531728510860e+02 1.255460953068927e+02 + 3 7.499999999999985e+02 -5.249326727483449e+02 5.207531728510861e+02 -1.255460953068926e+02 + ME 6.738137243509843e+00 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -7.488450735514018e+02 -4.113909777397591e+01 6.215340066190963e+00 + 3 7.499999999999995e+02 7.488450735514017e+02 4.113909777397586e+01 -6.215340066191014e+00 + ME 6.715253125950555e+00 + +Event 57 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.609915062325111e+02 7.065463493516471e+02 1.933235400535789e+02 + 3 7.499999999999998e+02 1.609915062325111e+02 -7.065463493516470e+02 -1.933235400535789e+02 + ME 6.825755569201879e+00 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.364577375600944e+02 -7.175918329662072e+02 1.701212718039087e+02 + 3 7.499999999999998e+02 -1.364577375600944e+02 7.175918329662076e+02 -1.701212718039087e+02 + ME 6.789025273262395e+00 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999998e+02 -5.772109336552555e+02 -3.832540253228652e+02 2.871304409891938e+02 + 3 7.499999999999995e+02 5.772109336552555e+02 3.832540253228648e+02 -2.871304409891936e+02 + ME 7.057922265389653e+00 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -3.416667813988472e+02 -5.270418494550994e+02 4.098666849251104e+02 + 3 7.499999999999983e+02 3.416667813988446e+02 5.270418494550997e+02 -4.098666849251105e+02 + ME 7.627772435170511e+00 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 -4.259888540845538e+02 -6.048881415507170e+02 -1.230602795667551e+02 + 3 7.499999999999998e+02 4.259888540845542e+02 6.048881415507167e+02 1.230602795667549e+02 + ME 6.871615030055534e+00 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 4.607995913538436e+02 -5.197207754200587e+02 -2.829382480416940e+02 + 3 7.499999999999993e+02 -4.607995913538430e+02 5.197207754200591e+02 2.829382480416940e+02 + ME 7.408709546175889e+00 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.281800042196735e+02 -1.771681287919234e+02 -7.174129498821536e+02 + 3 7.500000000000003e+02 1.281800042196737e+02 1.771681287919234e+02 7.174129498821536e+02 + ME 1.313550180712908e+01 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 6.405153128760093e+02 -3.399997059964824e+02 -1.914166499906789e+02 + 3 7.500000000000001e+02 -6.405153128760090e+02 3.399997059964824e+02 1.914166499906790e+02 + ME 7.044628251575686e+00 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.838295106811600e+02 2.142358614545267e+02 -6.948450954490231e+02 + 3 7.499999999999999e+02 -1.838295106811601e+02 -2.142358614545267e+02 6.948450954490231e+02 + ME 1.276299800831495e+01 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.913261705169197e+02 -1.414127723755367e+02 4.391307184322908e+02 + 3 7.500000000000005e+02 -5.913261705169198e+02 1.414127723755367e+02 -4.391307184322908e+02 + ME 7.821725715727443e+00 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.975437950835898e+02 -2.686137949018985e+01 -4.524598095400128e+02 + 3 7.499999999999999e+02 5.975437950835899e+02 2.686137949018984e+01 4.524598095400128e+02 + ME 8.701992064204690e+00 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.181330717451524e+02 -5.368392105903471e+02 7.643155066244265e+01 + 3 7.499999999999993e+02 -5.181330717451523e+02 5.368392105903465e+02 -7.643155066244279e+01 + ME 6.709576166570739e+00 + +Event 69 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.851952108095214e+02 3.816438081633558e+02 -6.185068613878011e+02 + 3 7.500000000000002e+02 -1.851952108095216e+02 -3.816438081633560e+02 6.185068613878012e+02 + ME 1.124076284141209e+01 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.420936051939211e+02 -6.644346036493945e+02 6.323466421748243e+01 + 3 7.500000000000005e+02 -3.420936051939211e+02 6.644346036493947e+02 -6.323466421748294e+01 + ME 6.706536729422492e+00 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 2.014973205429061e+02 -5.862298917340072e+02 4.221769106092426e+02 + 3 7.499999999999994e+02 -2.014973205429059e+02 5.862298917340071e+02 -4.221769106092423e+02 + ME 7.706257036162022e+00 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -5.128995270704379e+02 -5.335011449610840e+02 -1.216988227393154e+02 + 3 7.500000000000008e+02 5.128995270704376e+02 5.335011449610839e+02 1.216988227393155e+02 + ME 6.868877782003888e+00 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 6.269260124380270e+02 7.328600119139395e+01 -4.050838641046208e+02 + 3 7.500000000000001e+02 
-6.269260124380266e+02 -7.328600119139401e+01 4.050838641046201e+02 + ME 8.235262935910448e+00 + +Event 74 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.868628802298931e+02 4.654417523012835e+02 3.815671661692834e+01 + 3 7.500000000000000e+02 -5.868628802298932e+02 -4.654417523012835e+02 -3.815671661692834e+01 + ME 6.706017466715227e+00 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.396554044798227e+02 3.890541063878782e+02 -4.447320342052833e+01 + 3 7.499999999999997e+02 -6.396554044798223e+02 -3.890541063878784e+02 4.447320342052837e+01 + ME 6.752960586088662e+00 + +Event 76 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.541578029035598e+02 6.696747494064765e+02 2.223500421196449e+02 + 3 7.500000000000002e+02 -2.541578029035598e+02 -6.696747494064766e+02 -2.223500421196449e+02 + ME 6.882481865055330e+00 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.794085909550949e+02 -4.035830095427127e+02 -6.061627767357170e+02 + 3 7.500000000000013e+02 1.794085909550950e+02 4.035830095427118e+02 6.061627767357160e+02 + ME 1.100141889163688e+01 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000002e+02 -2.251897931439635e+01 5.985924556239659e+02 4.513091707917185e+02 + 3 7.500000000000005e+02 2.251897931439640e+01 -5.985924556239657e+02 -4.513091707917187e+02 + ME 7.910129474481908e+00 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -8.069660169556641e+01 4.506184873965206e+02 -5.940799923337418e+02 + 3 7.499999999999998e+02 8.069660169556640e+01 -4.506184873965205e+02 5.940799923337419e+02 + ME 1.077391892319198e+01 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 7.302693508383343e+02 -1.702453183034974e+02 1.494010782754770e+01 + 3 7.500000000000000e+02 -7.302693508383343e+02 1.702453183034973e+02 -1.494010782754776e+01 + ME 6.711629387132324e+00 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.571124181827449e+02 -1.296917603996396e+02 -6.466457825302152e+02 + 3 7.500000000000019e+02 3.571124181827448e+02 1.296917603996393e+02 6.466457825302152e+02 + ME 1.180584509807795e+01 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.079791544122487e+02 6.603817609425098e+02 2.883411194131627e+02 + 3 7.500000000000001e+02 2.079791544122487e+02 -6.603817609425098e+02 -2.883411194131628e+02 + ME 7.061913502663524e+00 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 
1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 2.106645025753153e+02 2.591228168950025e+02 6.715473424257533e+02 + 3 7.499999999999999e+02 -2.106645025753153e+02 -2.591228168950024e+02 -6.715473424257556e+02 + ME 1.015890233672465e+01 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.829033392465454e+02 5.412230867109213e+02 4.353656750988262e+02 + 3 7.499999999999991e+02 -2.829033392465457e+02 -5.412230867109214e+02 -4.353656750988268e+02 + ME 7.795328252255855e+00 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.452959441952361e+02 3.191655713577258e+02 -5.842979111568144e+02 + 3 7.499999999999997e+02 3.452959441952359e+02 -3.191655713577259e+02 5.842979111568144e+02 + ME 1.059515431718278e+01 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.315896816222929e+02 -5.262181855033000e+02 3.151583220585842e+02 + 3 7.500000000000000e+02 4.315896816222929e+02 5.262181855033001e+02 -3.151583220585842e+02 + ME 7.157764038058493e+00 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.811116980879164e+02 -3.427907013503558e+02 -5.474928389013653e+02 + 3 7.500000000000003e+02 -3.811116980879164e+02 3.427907013503558e+02 5.474928389013653e+02 + ME 9.969739456316118e+00 + +Event 88 Batch 
1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -6.146396045022400e+01 -6.744579332872385e+02 -3.222245766413236e+02 + 3 7.500000000000000e+02 6.146396045022398e+01 6.744579332872386e+02 3.222245766413236e+02 + ME 7.624622481497845e+00 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -9.261861454486515e+01 7.306822737190313e+02 -1.415104487752591e+02 + 3 7.499999999999998e+02 9.261861454486525e+01 -7.306822737190320e+02 1.415104487752591e+02 + ME 6.911294772499647e+00 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.139376319099176e+02 3.915039386019012e+02 4.877297437598081e+02 + 3 7.500000000000000e+02 4.139376319099175e+02 -3.915039386019014e+02 -4.877297437598083e+02 + ME 8.202957219152959e+00 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 7.126395371709457e+00 -7.420663234346179e+02 -1.085669660312691e+02 + 3 7.499999999999997e+02 -7.126395371709590e+00 7.420663234346180e+02 1.085669660312693e+02 + ME 6.843778977860541e+00 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -1.726503622084793e+02 -6.521623274545972e+02 -3.276830039508984e+02 + 3 7.500000000000009e+02 
1.726503622084785e+02 6.521623274545967e+02 3.276830039508975e+02 + ME 7.657992243729336e+00 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 4.209821045541698e+02 -6.195820136229016e+02 -3.731214333339759e+01 + 3 7.500000000000000e+02 -4.209821045541699e+02 6.195820136229019e+02 3.731214333339783e+01 + ME 6.745878151442942e+00 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999987e+02 6.269909815974255e+02 3.639809310785994e+02 1.920942237722172e+02 + 3 7.500000000000003e+02 -6.269909815974263e+02 -3.639809310786015e+02 -1.920942237722174e+02 + ME 6.823623123940109e+00 + +Event 95 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 4.553133048499316e+02 5.741086360057838e+02 -1.599658353843466e+02 + 3 7.500000000000000e+02 -4.553133048499316e+02 -5.741086360057838e+02 1.599658353843466e+02 + ME 6.955997757933584e+00 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.565619044698291e+02 6.422192246258138e+02 -2.902248381786659e+02 + 3 7.499999999999999e+02 -2.565619044698290e+02 -6.422192246258138e+02 2.902248381786659e+02 + ME 7.445693406135954e+00 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
4.039638951579398e+02 -2.118244738005755e+02 -5.953516303059346e+02 + 3 7.499999999999999e+02 -4.039638951579398e+02 2.118244738005755e+02 5.953516303059346e+02 + ME 1.079752230823119e+01 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -2.854056619662841e+02 -6.558450095995934e+02 2.256345086658882e+02 + 3 7.500000000000000e+02 2.854056619662841e+02 6.558450095995934e+02 -2.256345086658882e+02 + ME 6.889697738088179e+00 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000020e+02 6.845741170030375e+02 -2.951022215571757e+02 8.229797787022036e+01 + 3 7.499999999999948e+02 -6.845741170030357e+02 2.951022215571742e+02 -8.229797787021865e+01 + ME 6.711546568104289e+00 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 1.131199432162236e+02 6.658078482508746e+02 3.261959344539691e+02 + 3 7.499999999999997e+02 -1.131199432162238e+02 -6.658078482508746e+02 -3.261959344539690e+02 + ME 7.201543057684679e+00 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999983e+02 6.281139753924399e+02 1.799537449267307e+01 -4.094496311068951e+02 + 3 7.499999999999986e+02 -6.281139753924424e+02 -1.799537449266959e+01 4.094496311068942e+02 + ME 8.274303591360999e+00 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.331822864934145e+02 3.520635566235372e+02 -5.009035423770954e+02 + 3 7.499999999999999e+02 -4.331822864934145e+02 -3.520635566235372e+02 5.009035423770955e+02 + ME 9.288007577141197e+00 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 6.092502234787572e+02 -1.620958235289011e+02 -4.062500574591606e+02 + 3 7.499999999999993e+02 -6.092502234787571e+02 1.620958235289008e+02 4.062500574591607e+02 + ME 8.245616979601218e+00 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -7.390318022514670e+02 7.866020384803390e+01 -1.007202441991433e+02 + 3 7.500000000000001e+02 7.390318022514666e+02 -7.866020384803386e+01 1.007202441991435e+02 + ME 6.829885419783473e+00 + +Event 105 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.116661021739877e+02 -1.805958333876085e+02 -7.193162215523183e+02 + 3 7.500000000000003e+02 1.116661021739876e+02 1.805958333876086e+02 7.193162215523179e+02 + ME 1.316205600156798e+01 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.896019143353705e+02 -3.499565434426594e+01 -2.927874722764087e+02 + 3 7.499999999999997e+02 6.896019143353702e+02 3.499565434426605e+01 2.927874722764089e+02 + ME 7.459019026200863e+00 + +Event 107 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -5.876020488404370e+02 4.472601133788147e+02 1.310809794702898e+02 + 3 7.499999999999998e+02 5.876020488404369e+02 -4.472601133788147e+02 -1.310809794702898e+02 + ME 6.743118010396794e+00 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -1.474590340134895e+02 7.351525828033792e+02 1.750752082190591e+01 + 3 7.500000000000000e+02 1.474590340134895e+02 -7.351525828033792e+02 -1.750752082190613e+01 + ME 6.710720925424265e+00 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.957445751534650e+02 5.717428497601883e+02 -3.848964276469646e+02 + 3 7.499999999999994e+02 -2.957445751534647e+02 -5.717428497601884e+02 3.848964276469649e+02 + ME 8.064389175975579e+00 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.063365970415883e+00 -3.265074122789853e+02 6.751711970456008e+02 + 3 7.500000000000002e+02 -6.063365970415944e+00 3.265074122789853e+02 -6.751711970456010e+02 + ME 1.019079421803669e+01 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -7.490455319369723e+02 -3.406507833202954e+01 -1.644267385470490e+01 + 3 7.500000000000014e+02 
7.490455319369728e+02 3.406507833202960e+01 1.644267385470367e+01 + ME 6.728562055083565e+00 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 2.374207045694743e+02 5.497834357859951e+01 7.093016218641318e+02 + 3 7.499999999999997e+02 -2.374207045694743e+02 -5.497834357859953e+01 -7.093016218641318e+02 + ME 1.038164665955649e+01 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 5.836102630475973e+01 7.112469905859641e+02 2.306983072999552e+02 + 3 7.499999999999999e+02 -5.836102630475965e+01 -7.112469905859641e+02 -2.306983072999552e+02 + ME 6.901152780652538e+00 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -2.832697985127742e+02 -6.792924150459122e+02 1.442914970177563e+02 + 3 7.500000000000011e+02 2.832697985127735e+02 6.792924150459127e+02 -1.442914970177568e+02 + ME 6.756509728859192e+00 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -3.877068793675994e+02 -5.687932010880680e+02 2.977543787873256e+02 + 3 7.500000000000000e+02 3.877068793675994e+02 5.687932010880680e+02 -2.977543787873257e+02 + ME 7.093916506611649e+00 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999997e+02 -2.888638462220085e+02 -2.429234055910374e+02 -6.481094794568893e+02 + 3 7.499999999999995e+02 2.888638462220084e+02 2.429234055910374e+02 6.481094794568891e+02 + ME 1.183568975883744e+01 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -1.057325810417607e+02 -2.863650344598337e+02 -6.850661926741572e+02 + 3 7.499999999999999e+02 1.057325810417607e+02 2.863650344598337e+02 6.850661926741573e+02 + ME 1.257888595914403e+01 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.434912620455068e+02 4.133496415655201e+02 -5.231212351989172e+02 + 3 7.500000000000000e+02 3.434912620455068e+02 -4.133496415655201e+02 5.231212351989172e+02 + ME 9.597962463599572e+00 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999981e+02 5.337372537382134e+02 -1.304062014214278e+00 5.268992725315259e+02 + 3 7.499999999999999e+02 -5.337372537382154e+02 1.304062014214360e+00 -5.268992725315238e+02 + ME 8.566935674870347e+00 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000025e+02 2.526044577837304e+02 -5.022225052464468e+02 4.964509473571080e+02 + 3 7.499999999999989e+02 -2.526044577837272e+02 5.022225052464484e+02 -4.964509473571086e+02 + ME 8.279575669732093e+00 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -9.983643901956182e+01 7.430481637446571e+02 2.030053692852352e+01 + 3 7.500000000000000e+02 9.983643901956182e+01 -7.430481637446571e+02 -2.030053692852348e+01 + ME 6.709814025805890e+00 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.972556630245654e+02 1.216831948271524e+02 -2.480478592466205e+02 + 3 7.499999999999999e+02 -6.972556630245654e+02 -1.216831948271524e+02 2.480478592466205e+02 + ME 7.249152122096383e+00 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -6.140782040297673e+02 4.305900507141508e+02 4.093428375274393e-01 + 3 7.499999999999980e+02 6.140782040297667e+02 -4.305900507141529e+02 -4.093428375271969e-01 + ME 6.718125316427420e+00 + +Event 124 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999911e+02 3.588901340872197e+01 -1.799600883903311e+02 -7.272044728293959e+02 + 3 7.499999999999977e+02 -3.588901340872244e+01 1.799600883903318e+02 7.272044728294055e+02 + ME 1.326100757037803e+01 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 7.215024155075263e+02 1.793419884871220e+02 9.884693006001493e+01 + 3 7.499999999999998e+02 -7.215024155075263e+02 -1.793419884871223e+02 -9.884693006001488e+01 + ME 
6.719185656788959e+00 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.281551412895951e+02 -2.336695998906989e+02 -7.010526213116709e+02 + 3 7.500000000000006e+02 -1.281551412895945e+02 2.336695998906985e+02 7.010526213116711e+02 + ME 1.287412953205788e+01 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -1.957501429351263e+02 4.658358804290750e+02 -5.542371460356527e+02 + 3 7.500000000000007e+02 1.957501429351261e+02 -4.658358804290748e+02 5.542371460356528e+02 + ME 1.007860285681419e+01 + +Event 128 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.318760792775901e+01 -6.964451407927100e+02 2.780112459296673e+02 + 3 7.500000000000002e+02 -1.318760792775897e+01 6.964451407927102e+02 -2.780112459296673e+02 + ME 7.028752582732379e+00 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.550350801573030e+02 -6.509004311564037e+02 3.388107918013039e+02 + 3 7.499999999999999e+02 -1.550350801573030e+02 6.509004311564037e+02 -3.388107918013039e+02 + ME 7.254852487794563e+00 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.645749092425722e+02 -7.293083567438612e+02 
5.936682601728498e+01 + 3 7.499999999999986e+02 1.645749092425725e+02 7.293083567438607e+02 -5.936682601728561e+01 + ME 6.706008979498051e+00 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.289732569218596e+02 -7.242036888606353e+02 -1.462700107322627e+02 + 3 7.500000000000002e+02 -1.289732569218596e+02 7.242036888606353e+02 1.462700107322627e+02 + ME 6.922332000014933e+00 + +Event 132 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999969e+02 6.604855820021785e+01 3.651487935846818e+02 -6.517698570073027e+02 + 3 7.499999999999962e+02 -6.604855820021810e+01 -3.651487935846863e+02 6.517698570073037e+02 + ME 1.191038759405939e+01 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.812417328859055e+02 -6.435356503900176e+02 5.492365413234045e+01 + 3 7.500000000000001e+02 3.812417328859055e+02 6.435356503900176e+02 -5.492365413234046e+01 + ME 6.705604807235567e+00 + +Event 134 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000010e+02 -6.567102166802748e+02 -3.252202169638837e+02 1.595728729631447e+02 + 3 7.500000000000008e+02 6.567102166802740e+02 3.252202169638840e+02 -1.595728729631455e+02 + ME 6.774711509571564e+00 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -2.837316099570886e+02 -6.854901002516956e+02 1.099986180280411e+02 + 3 7.499999999999999e+02 2.837316099570885e+02 6.854901002516956e+02 -1.099986180280411e+02 + ME 6.726091843084070e+00 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -1.445964163493952e+02 -3.365935275091168e+02 6.544437895021098e+02 + 3 7.500000000000001e+02 1.445964163493952e+02 3.365935275091169e+02 -6.544437895021098e+02 + ME 9.990701269524786e+00 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 2.788569431822102e+02 -2.310081397848976e+02 6.567907159759454e+02 + 3 7.500000000000001e+02 -2.788569431822101e+02 2.310081397848976e+02 -6.567907159759454e+02 + ME 1.001520541801247e+01 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.519171223259784e+02 5.310903622506274e+02 -3.957238508585245e+02 + 3 7.500000000000000e+02 -3.519171223259784e+02 -5.310903622506274e+02 3.957238508585245e+02 + ME 8.154096533961216e+00 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 3.739397549265399e+02 -2.618238911224230e+02 -5.950775661399049e+02 + 3 7.500000000000000e+02 -3.739397549265399e+02 2.618238911224230e+02 5.950775661399049e+02 + ME 1.079242835245346e+01 + +Event 140 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 6.980454399514575e+02 1.394595291726036e+02 -2.361855276809563e+02 + 3 7.500000000000002e+02 -6.980454399514578e+02 -1.394595291726037e+02 2.361855276809564e+02 + ME 7.201063249229033e+00 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 4.653840213895682e+02 -2.880922628106119e+01 5.874416916736161e+02 + 3 7.500000000000005e+02 -4.653840213895682e+02 2.880922628106116e+01 -5.874416916736163e+02 + ME 9.221913954672504e+00 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.270562981552866e+02 -5.222032422100018e+02 4.881077865527037e+02 + 3 7.499999999999998e+02 -2.270562981552868e+02 5.222032422100018e+02 -4.881077865527037e+02 + ME 8.206225801833849e+00 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 4.278386633790233e+02 5.011307304374205e+02 3.582206989124281e+02 + 3 7.500000000000003e+02 -4.278386633790234e+02 -5.011307304374206e+02 -3.582206989124283e+02 + ME 7.344041995630785e+00 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.485631413852261e+02 -2.868202125643110e+02 -2.441311477486855e+02 + 3 7.499999999999998e+02 6.485631413852262e+02 
2.868202125643110e+02 2.441311477486854e+02 + ME 7.232945668603777e+00 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 5.489272678263819e+02 -2.043717898952162e-02 5.110566056896098e+02 + 3 7.500000000000000e+02 -5.489272678263819e+02 2.043717898950950e-02 -5.110566056896097e+02 + ME 8.413582057266058e+00 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -2.879378412153704e+02 -6.925213285531938e+02 2.451345579835997e+00 + 3 7.500000000000003e+02 2.879378412153705e+02 6.925213285531938e+02 -2.451345579835858e+00 + ME 6.717073075170038e+00 + +Event 147 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 6.952915705099502e+02 1.689784617132080e+02 -2.247574502766133e+02 + 3 7.499999999999999e+02 -6.952915705099505e+02 -1.689784617132079e+02 2.247574502766132e+02 + ME 7.157475465728229e+00 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -6.957082656749750e+02 1.510461070560446e+02 2.359556793440931e+02 + 3 7.499999999999999e+02 6.957082656749749e+02 -1.510461070560447e+02 -2.359556793440931e+02 + ME 6.913475543846797e+00 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 
-3.906011724554260e+02 -4.023428219257597e+02 4.980471641534193e+02 + 3 7.500000000000000e+02 3.906011724554261e+02 4.023428219257598e+02 -4.980471641534193e+02 + ME 8.293874069871780e+00 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000017e+02 3.825548542963734e+02 3.607661939729296e+02 -5.347892451616488e+02 + 3 7.500000000000000e+02 -3.825548542963752e+02 -3.607661939729286e+02 5.347892451616495e+02 + ME 9.771743340485465e+00 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -8.096198305186688e+01 -6.966591351540006e+02 -2.657276927736156e+02 + 3 7.500000000000000e+02 8.096198305186688e+01 6.966591351540006e+02 2.657276927736156e+02 + ME 7.326493934386413e+00 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 -3.053425729950390e+02 6.849509469742738e+02 1.039775724928324e+01 + 3 7.500000000000000e+02 3.053425729950396e+02 -6.849509469742737e+02 -1.039775724928314e+01 + ME 6.713412436221026e+00 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -9.852328869759020e+01 -1.550106591050277e+02 7.271621945261152e+02 + 3 7.499999999999997e+02 9.852328869759015e+01 1.550106591050274e+02 -7.271621945261152e+02 + ME 1.035326549865974e+01 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -3.148528978665797e+01 -7.103339962655654e+02 2.386090741703714e+02 + 3 7.499999999999997e+02 3.148528978665795e+01 7.103339962655651e+02 -2.386090741703716e+02 + ME 6.919863451711640e+00 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.874782580333551e+02 2.540696668148104e+02 -5.102093220918027e+02 + 3 7.499999999999998e+02 4.874782580333551e+02 -2.540696668148104e+02 5.102093220918026e+02 + ME 9.414533796991890e+00 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999992e+02 5.351140291898049e+02 3.025756173962969e+02 4.296521517710605e+02 + 3 7.499999999999997e+02 -5.351140291898049e+02 -3.025756173962968e+02 -4.296521517710606e+02 + ME 7.756097582470217e+00 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 1.236396872289430e+01 2.448362281552731e+02 -7.088034661738873e+02 + 3 7.499999999999994e+02 -1.236396872289412e+01 -2.448362281552726e+02 7.088034661738874e+02 + ME 1.300456238570832e+01 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 4.792285853193343e+02 5.225230422173581e+02 2.445600812985376e+02 + 3 7.499999999999998e+02 -4.792285853193341e+02 -5.225230422173581e+02 -2.445600812985378e+02 + ME 6.934607752233142e+00 + +Event 159 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.994239737611111e+02 4.487043427103747e+02 4.307333877573547e+01 + 3 7.500000000000000e+02 5.994239737611113e+02 -4.487043427103748e+02 -4.307333877573552e+01 + ME 6.705579660517077e+00 + +Event 160 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.056348972134614e+00 -5.530529151025125e+02 -5.065879560584343e+02 + 3 7.499999999999999e+02 1.056348972134826e+00 5.530529151025125e+02 5.065879560584341e+02 + ME 9.364737116058938e+00 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -4.175060175378059e+02 -6.201939914735473e+02 5.956625101372995e+01 + 3 7.499999999999995e+02 4.175060175378056e+02 6.201939914735476e+02 -5.956625101373007e+01 + ME 6.706032182849204e+00 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000072e+02 -3.532767836310041e+02 -6.048160389091228e+02 2.681288369899752e+02 + 3 7.499999999999935e+02 3.532767836310078e+02 6.048160389091242e+02 -2.681288369899759e+02 + ME 6.998882126198096e+00 + +Event 163 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.455155148577347e+01 -5.419588320322101e+02 5.172879417425158e+02 + 3 7.500000000000001e+02 
-3.455155148577347e+01 5.419588320322102e+02 -5.172879417425160e+02 + ME 8.472915145180961e+00 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 1.203138720765575e+02 6.593334144494570e+02 3.366066261623008e+02 + 3 7.500000000000016e+02 -1.203138720765577e+02 -6.593334144494559e+02 -3.366066261623014e+02 + ME 7.245279798414320e+00 + +Event 165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.816823713406797e+02 -1.602945128408125e+02 7.097867180337723e+02 + 3 7.499999999999998e+02 1.816823713406797e+02 1.602945128408126e+02 -7.097867180337723e+02 + ME 1.038238038894466e+01 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -4.857732699129262e+01 -2.617520387305627e+02 7.011605476085883e+02 + 3 7.500000000000005e+02 4.857732699129346e+01 2.617520387305621e+02 -7.011605476085887e+02 + ME 1.035922646727294e+01 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -4.900092704286318e+01 -6.320471173340014e+02 4.007684501288982e+02 + 3 7.499999999999995e+02 4.900092704286321e+01 6.320471173340015e+02 -4.007684501288982e+02 + ME 7.572564742728944e+00 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.499999999999999e+02 5.264562441053386e+02 -1.340492719285199e+01 -5.340076132130089e+02 + 3 7.500000000000000e+02 -5.264562441053386e+02 1.340492719285199e+01 5.340076132130089e+02 + ME 9.759861238647328e+00 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.804167788417345e+02 6.461491047275697e+02 1.656529008013751e+01 + 3 7.500000000000002e+02 3.804167788417345e+02 -6.461491047275698e+02 -1.656529008013748e+01 + ME 6.711045999579444e+00 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -8.097738208309657e+01 -3.146836244882359e+02 6.759562708266824e+02 + 3 7.500000000000005e+02 8.097738208309649e+01 3.146836244882357e+02 -6.759562708266828e+02 + ME 1.019749138789619e+01 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -7.540116185339261e+01 -3.719303863809236e+02 -6.469021969955721e+02 + 3 7.500000000000017e+02 7.540116185339285e+01 3.719303863809247e+02 6.469021969955687e+02 + ME 1.181107195061937e+01 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999990e+02 -4.828016172825363e+02 -5.140078459318596e+02 -2.553400334257387e+02 + 3 7.499999999999997e+02 4.828016172825364e+02 5.140078459318597e+02 2.553400334257382e+02 + ME 7.280209633800780e+00 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.987927902937172e+01 -2.766666825658829e+01 7.462247930951604e+02 + 3 7.499999999999999e+02 -6.987927902937163e+01 2.766666825658830e+01 -7.462247930951604e+02 + ME 1.015295949221370e+01 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 4.529896621351460e+02 -2.479499892141399e+02 5.438944464208056e+02 + 3 7.500000000000002e+02 -4.529896621351460e+02 2.479499892141399e+02 -5.438944464208056e+02 + ME 8.740428722501580e+00 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 1.641260263514685e+02 6.805045353716747e+02 -2.692140873220755e+02 + 3 7.499999999999995e+02 -1.641260263514684e+02 -6.805045353716745e+02 2.692140873220758e+02 + ME 7.342581363432417e+00 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 -4.449083452207171e+02 3.512022859276665e+02 4.911349292324310e+02 + 3 7.500000000000002e+02 4.449083452207169e+02 -3.512022859276665e+02 -4.911349292324313e+02 + ME 8.232570580250515e+00 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 6.756184157382538e+02 2.756014812918595e+02 1.734461871159609e+02 + 3 7.499999999999999e+02 -6.756184157382538e+02 -2.756014812918595e+02 -1.734461871159609e+02 + ME 
6.793840564450193e+00 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -7.899028545581652e+01 6.651246644987089e+02 3.374458704430804e+02 + 3 7.500000000000000e+02 7.899028545581672e+01 -6.651246644987090e+02 -3.374458704430803e+02 + ME 7.248911588364097e+00 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 -5.649043925133968e+02 -4.752925559656242e+02 1.322120023399266e+02 + 3 7.499999999999994e+02 5.649043925133972e+02 4.752925559656248e+02 -1.322120023399254e+02 + ME 6.744181199572066e+00 + +Event 180 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.443749249963932e+02 -3.789807589401903e+02 6.045279471629056e+01 + 3 7.499999999999999e+02 -6.443749249963932e+02 3.789807589401904e+02 -6.045279471629056e+01 + ME 6.706140603805416e+00 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -7.487240721466687e+02 -4.365618810460560e+01 2.530025352630979e+00 + 3 7.500000000000003e+02 7.487240721466687e+02 4.365618810460534e+01 -2.530025352631128e+00 + ME 6.717033445711649e+00 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -1.736127789510567e+02 -4.849363338101802e+02 
5.451562667122455e+02 + 3 7.499999999999997e+02 1.736127789510564e+02 4.849363338101802e+02 -5.451562667122455e+02 + ME 8.753666706450035e+00 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999927e+02 -1.465394368041592e+02 7.111974597422186e+02 1.876815566786875e+02 + 3 7.499999999999959e+02 1.465394368041588e+02 -7.111974597422145e+02 -1.876815566786932e+02 + ME 6.816143270442155e+00 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -1.708920160408793e+02 -6.330544605807505e+02 3.640576448755044e+02 + 3 7.500000000000005e+02 1.708920160408793e+02 6.330544605807504e+02 -3.640576448755050e+02 + ME 7.372635144826682e+00 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 -4.460547321795561e+02 -2.478493945403630e+02 5.496415682298783e+02 + 3 7.500000000000003e+02 4.460547321795564e+02 2.478493945403632e+02 -5.496415682298782e+02 + ME 8.801107357866194e+00 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000016e+02 -2.118512395965089e+01 5.909417855163437e+02 4.613447698344136e+02 + 3 7.499999999999995e+02 2.118512395965117e+01 -5.909417855163440e+02 -4.613447698344144e+02 + ME 7.986507314578602e+00 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 2.653808167750950e+02 -7.001457156827445e+02 -4.323191990741312e+01 + 3 7.500000000000000e+02 -2.653808167750950e+02 7.001457156827445e+02 4.323191990741302e+01 + ME 6.751690624904062e+00 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 6.408357091392251e+02 3.883393337910244e+02 -3.197117018143498e+01 + 3 7.500000000000002e+02 -6.408357091392251e+02 -3.883393337910245e+02 3.197117018143515e+01 + ME 6.740978036777086e+00 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999945e+02 6.049264826448527e+01 2.924911886358718e+01 7.469840216173675e+02 + 3 7.499999999999955e+02 -6.049264826447848e+01 -2.924911886358377e+01 -7.469840216173619e+02 + ME 1.014012989536147e+01 + +Event 190 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.303745399677808e+02 -6.726435165214554e+02 -3.005599811564162e+01 + 3 7.499999999999998e+02 3.303745399677808e+02 6.726435165214554e+02 3.005599811564162e+01 + ME 6.739299930414858e+00 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 2.507415243158170e+02 -6.861277073704367e+02 1.698748279244853e+02 + 3 7.500000000000024e+02 -2.507415243158163e+02 6.861277073704370e+02 -1.698748279244831e+02 + ME 6.788674205289294e+00 + +Event 192 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -3.280906177080156e+02 -4.243783337877665e+02 -5.241751390360782e+02 + 3 7.500000000000001e+02 3.280906177080155e+02 4.243783337877666e+02 5.241751390360782e+02 + ME 9.613343988054837e+00 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.030479677918753e+02 3.086657427947369e+02 6.126886553888426e+02 + 3 7.500000000000000e+02 3.030479677918753e+02 -3.086657427947368e+02 -6.126886553888426e+02 + ME 9.516778710827076e+00 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -2.646170069729470e+02 3.428856457724426e+02 -6.122967201805791e+02 + 3 7.500000000000000e+02 2.646170069729470e+02 -3.428856457724426e+02 6.122967201805791e+02 + ME 1.111953658219990e+01 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -5.753494363932364e+02 4.203961533313043e+02 -2.339660238287344e+02 + 3 7.499999999999999e+02 5.753494363932361e+02 -4.203961533313043e+02 2.339660238287343e+02 + ME 7.192391167104575e+00 + +Event 196 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 -4.281251117808166e+02 -5.086770881665590e+02 3.470684494981561e+02 + 3 7.499999999999997e+02 4.281251117808166e+02 
5.086770881665587e+02 -3.470684494981560e+02 + ME 7.291710264585493e+00 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -8.339544953992254e+01 -6.784858681092461e+02 -3.085484172250403e+02 + 3 7.499999999999997e+02 8.339544953992269e+01 6.784858681092458e+02 3.085484172250410e+02 + ME 7.544747609232306e+00 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -8.306003478718614e+01 -8.401139414697989e+00 7.453391519822949e+02 + 3 7.499999999999995e+02 8.306003478718661e+01 8.401139414698106e+00 -7.453391519822942e+02 + ME 1.016738604948644e+01 + +Event 199 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.019646293152730e+02 3.128355820782189e+02 -6.510331689640140e+02 + 3 7.500000000000000e+02 2.019646293152730e+02 -3.128355820782189e+02 6.510331689640140e+02 + ME 1.189534908491987e+01 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 6.426710110696476e+01 3.020308647655598e+02 -6.834815991994776e+02 + 3 7.499999999999998e+02 -6.426710110696486e+01 -3.020308647655599e+02 6.834815991994777e+02 + ME 1.254822989235959e+01 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 
4.022805093987974e+02 -5.266311217523918e+02 -3.511837885775217e+02 + 3 7.499999999999998e+02 -4.022805093987973e+02 5.266311217523916e+02 3.511837885775217e+02 + ME 7.811934851106916e+00 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.007651650396935e+02 -3.709039186002584e+02 5.783343312182138e+02 + 3 7.500000000000001e+02 3.007651650396937e+02 3.709039186002585e+02 -5.783343312182138e+02 + ME 9.117543166414862e+00 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -3.363438797848475e+01 7.118317541034084e+01 -7.458563423902846e+02 + 3 7.500000000000000e+02 3.363438797848460e+01 -7.118317541034091e+01 7.458563423902846e+02 + ME 1.340052505645933e+01 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -7.528559343153047e+01 -6.321297640200863e+02 -3.965400873323797e+02 + 3 7.499999999999993e+02 7.528559343153123e+01 6.321297640200862e+02 3.965400873323802e+02 + ME 8.161039193344054e+00 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -2.566959580289875e+02 5.032495686744615e+02 4.933021961845994e+02 + 3 7.500000000000002e+02 2.566959580289877e+02 -5.032495686744614e+02 -4.933021961845996e+02 + ME 8.251619891499795e+00 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999975e+02 -5.149079042091835e+02 5.440418782066425e+02 3.725969592322714e+01 + 3 7.500000000000002e+02 5.149079042091855e+02 -5.440418782066433e+02 -3.725969592322816e+01 + ME 6.706125653182199e+00 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 1.433423326606751e+02 7.409449380754353e+01 7.324363328336475e+02 + 3 7.499999999999995e+02 -1.433423326606743e+02 -7.409449380754377e+01 -7.324363328336461e+02 + ME 1.031871628400673e+01 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 3.842573371112481e+01 7.342148980903339e+02 1.481618925735034e+02 + 3 7.499999999999997e+02 -3.842573371112488e+01 -7.342148980903335e+02 -1.481618925735034e+02 + ME 6.760841228553823e+00 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 6.901829459970367e+02 -7.313618628183561e+01 2.842509442570098e+02 + 3 7.499999999999997e+02 -6.901829459970354e+02 7.313618628183605e+01 -2.842509442570091e+02 + ME 7.048541958204960e+00 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.714997395800420e+02 4.794362694683780e+02 7.764606350735725e+01 + 3 7.500000000000010e+02 -5.714997395800424e+02 -4.794362694683778e+02 -7.764606350735751e+01 + ME 6.709952662765318e+00 + +Event 211 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999985e+02 -1.281705743857864e+02 6.553011438753649e+01 -7.360557777573463e+02 + 3 7.500000000000011e+02 1.281705743857857e+02 -6.553011438753558e+01 7.360557777573472e+02 + ME 1.334645347867926e+01 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000023e+02 -3.302472484430320e+02 -5.946768960171274e+02 3.159052773209681e+02 + 3 7.499999999999994e+02 3.302472484430327e+02 5.946768960171283e+02 -3.159052773209689e+02 + ME 7.160644558910863e+00 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.589381160217068e+02 4.877683676965272e+02 -1.103186381930632e+02 + 3 7.500000000000048e+02 -5.589381160217124e+02 -4.877683676965237e+02 1.103186381930646e+02 + ME 6.846992080427366e+00 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000039e+02 -5.489994742235593e+02 5.022963490456326e+02 9.379741487743684e+01 + 3 7.499999999999869e+02 5.489994742235626e+02 -5.022963490456337e+02 -9.379741487743767e+01 + ME 6.716526733612570e+00 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 5.451972808110462e+02 -4.842724140540300e+02 1.753287026773505e+02 + 3 7.499999999999995e+02 
-5.451972808110462e+02 4.842724140540300e+02 -1.753287026773504e+02 + ME 6.796632303852332e+00 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -3.351512110674362e+02 -6.344873209566202e+02 -2.181731084834278e+02 + 3 7.500000000000000e+02 3.351512110674362e+02 6.344873209566204e+02 2.181731084834278e+02 + ME 7.133537822042087e+00 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000092e+02 2.256146404784501e+02 -3.672666490292720e+02 6.137696982684969e+02 + 3 7.500000000000277e+02 -2.256146404784403e+02 3.672666490292735e+02 -6.137696982684706e+02 + ME 9.529470279328248e+00 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.145068310337325e+02 3.208524589836873e+02 -4.414027870140238e+02 + 3 7.500000000000002e+02 -5.145068310337325e+02 -3.208524589836873e+02 4.414027870140238e+02 + ME 8.584251506095839e+00 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.924092255386964e+02 2.606260561841842e+02 1.230590233456330e+02 + 3 7.500000000000002e+02 6.924092255386964e+02 -2.606260561841842e+02 -1.230590233456330e+02 + ME 6.736018565037169e+00 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.500000000000115e+02 1.410851717299374e+02 -1.722511762647995e+00 -7.366084490902481e+02 + 3 7.500000000000023e+02 -1.410851717299434e+02 1.722511762645135e+00 7.366084490902635e+02 + ME 1.335073459249307e+01 + +Event 221 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999995e+02 -4.216972323744533e+02 -5.474264157117908e+02 2.915403292662405e+02 + 3 7.499999999999999e+02 4.216972323744525e+02 5.474264157117910e+02 -2.915403292662400e+02 + ME 7.072596233893424e+00 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000006e+02 -5.703291911524503e+02 2.159931833959636e+02 4.365450245345482e+02 + 3 7.500000000000002e+02 5.703291911524508e+02 -2.159931833959636e+02 -4.365450245345480e+02 + ME 7.803550027599576e+00 + +Event 223 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000002e+02 -4.653699780465836e+02 5.516351728100433e+02 -2.040328886521882e+02 + 3 7.500000000000001e+02 4.653699780465835e+02 -5.516351728100433e+02 2.040328886521882e+02 + ME 7.084920764438142e+00 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.340910046878813e+02 -5.773211279348984e+02 4.176143158037027e+02 + 3 7.500000000000000e+02 2.340910046878813e+02 5.773211279348985e+02 -4.176143158037027e+02 + ME 7.676652060550525e+00 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000009e+02 1.759434290522927e+02 6.827868599275705e+02 2.556286636579899e+02 + 3 7.500000000000017e+02 -1.759434290522938e+02 -6.827868599275712e+02 -2.556286636579905e+02 + ME 6.963601312726175e+00 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -6.045452037729307e+02 -4.263792434151545e+02 1.233930199803326e+02 + 3 7.500000000000007e+02 6.045452037729308e+02 4.263792434151547e+02 -1.233930199803321e+02 + ME 6.736298806774612e+00 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 5.413532699307888e+02 4.584643726767907e+02 -2.434071817373498e+02 + 3 7.499999999999997e+02 -5.413532699307889e+02 -4.584643726767907e+02 2.434071817373500e+02 + ME 7.229985849870164e+00 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 1.451497175256272e+01 -1.575609542562009e+02 -7.331192681200636e+02 + 3 7.499999999999999e+02 -1.451497175256276e+01 1.575609542562010e+02 7.331192681200636e+02 + ME 1.332151398119770e+01 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999989e+02 -3.976075277594225e+02 5.994840638922486e+02 2.121959259935821e+02 + 3 7.500000000000005e+02 3.976075277594232e+02 -5.994840638922486e+02 -2.121959259935819e+02 + ME 
6.861218344473272e+00 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 3.454994669319923e+02 6.424027082758220e+02 -1.744960708715187e+02 + 3 7.499999999999999e+02 -3.454994669319923e+02 -6.424027082758219e+02 1.744960708715187e+02 + ME 6.994921270876956e+00 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 5.709700188990096e+01 -9.588670170606456e+01 -7.416506406733004e+02 + 3 7.500000000000001e+02 -5.709700188990102e+01 9.588670170606431e+01 7.416506406733004e+02 + ME 1.338325825898647e+01 + +Event 232 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000014e+02 -6.095841334961025e+02 1.049630507684870e+02 4.241343444749286e+02 + 3 7.500000000000014e+02 6.095841334961026e+02 -1.049630507684888e+02 -4.241343444749290e+02 + ME 7.719146738296482e+00 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 3.540619742696694e+02 6.247200054612367e+02 2.164833322747417e+02 + 3 7.499999999999999e+02 -3.540619742696694e+02 -6.247200054612367e+02 -2.164833322747417e+02 + ME 6.870005768268533e+00 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.021981849113400e+02 4.737027333003311e+02 
-2.930916299308990e+02 + 3 7.500000000000008e+02 -5.021981849113404e+02 -4.737027333003315e+02 2.930916299308989e+02 + ME 7.460611794746191e+00 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -2.462818585976497e+02 -2.802589959482592e+02 6.506152021861774e+02 + 3 7.499999999999997e+02 2.462818585976496e+02 2.802589959482593e+02 -6.506152021861776e+02 + ME 9.949962111171558e+00 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999976e+02 -7.345429587235471e+02 -4.314359541629795e+01 -1.452076856237840e+02 + 3 7.500000000000007e+02 7.345429587235451e+02 4.314359541629806e+01 1.452076856237857e+02 + ME 6.919839367434639e+00 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -2.154628275024254e+02 5.951093556116026e+02 -4.023936192687490e+02 + 3 7.500000000000002e+02 2.154628275024255e+02 -5.951093556116026e+02 4.023936192687491e+02 + ME 8.211583192649293e+00 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999997e+02 -5.866932570143750e+01 -7.232769026327443e+02 -1.895479684399755e+02 + 3 7.499999999999993e+02 5.866932570143736e+01 7.232769026327437e+02 1.895479684399756e+02 + ME 7.038899087839023e+00 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000008e+02 -6.560694650821694e+02 1.084032761720732e+02 -3.468740213708120e+02 + 3 7.500000000000002e+02 6.560694650821692e+02 -1.084032761720727e+02 3.468740213708121e+02 + ME 7.782414325579205e+00 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -4.455880632469494e+02 -5.785873576156609e+02 1.708448053034083e+02 + 3 7.499999999999998e+02 4.455880632469495e+02 5.785873576156609e+02 -1.708448053034080e+02 + ME 6.790060620693855e+00 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 2.940553859063193e+02 5.948775697686277e+02 3.495026566503573e+02 + 3 7.500000000000001e+02 -2.940553859063193e+02 -5.948775697686276e+02 -3.495026566503572e+02 + ME 7.302879345857822e+00 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000000e+02 -6.255013042950071e+02 3.893831803342039e+02 1.401030235150751e+02 + 3 7.500000000000000e+02 6.255013042950072e+02 -3.893831803342043e+02 -1.401030235150751e+02 + ME 6.752031966491566e+00 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 -5.768430196426254e+02 -8.702513188290264e+01 4.713584168234837e+02 + 3 7.499999999999999e+02 5.768430196426253e+02 8.702513188290258e+01 -4.713584168234837e+02 + ME 8.065960198683422e+00 + +Event 244 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -1.570848048302291e+02 6.678520603231959e+02 -3.029818305006274e+02 + 3 7.499999999999997e+02 1.570848048302291e+02 -6.678520603231960e+02 3.029818305006274e+02 + ME 7.513715485806262e+00 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000026e+02 -3.619595558325996e+02 -3.635357995317505e+02 5.471078526216533e+02 + 3 7.500000000000018e+02 3.619595558326022e+02 3.635357995317535e+02 -5.471078526216538e+02 + ME 8.774235169550527e+00 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999994e+02 4.171130913584880e+02 6.858303858260207e+01 -6.195264609652622e+02 + 3 7.499999999999999e+02 -4.171130913584882e+02 -6.858303858260196e+01 6.195264609652622e+02 + ME 1.126081387194467e+01 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000001e+02 2.343959954156196e+02 3.550474070294903e+02 6.176567461744072e+02 + 3 7.500000000000000e+02 -2.343959954156195e+02 -3.550474070294903e+02 -6.176567461744072e+02 + ME 9.575066260706294e+00 + +Event 248 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000005e+02 5.858635267134600e+02 3.455507392677969e+02 -3.160041370905556e+02 + 3 7.499999999999998e+02 -5.858635267134599e+02 
-3.455507392677965e+02 3.160041370905557e+02 + ME 7.587640824521511e+00 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 7.056780166938407e+02 2.349912780195324e+02 -9.642425011268840e+01 + 3 7.500000000000000e+02 -7.056780166938408e+02 -2.349912780195324e+02 9.642425011268843e+01 + ME 6.822620706535901e+00 + +Event 250 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000003e+02 1.004573993267176e+02 5.235762666245009e+01 7.413952993179739e+02 + 3 7.500000000000003e+02 -1.004573993267173e+02 -5.235762666244982e+01 -7.413952993179740e+02 + ME 1.022488250131083e+01 + +Event 251 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000028e+02 -6.475270246624094e+02 1.104657610733045e+02 -3.619476038908660e+02 + 3 7.499999999999985e+02 6.475270246624103e+02 -1.104657610733049e+02 3.619476038908664e+02 + ME 7.888308147189524e+00 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999999e+02 -5.030430118392509e+02 3.645899530085105e+02 4.201450873269224e+02 + 3 7.499999999999999e+02 5.030430118392509e+02 -3.645899530085105e+02 -4.201450873269224e+02 + ME 7.692997554589848e+00 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000007e+02 
-5.256865986862221e+02 -4.330514916902603e+02 3.140382230024754e+02 + 3 7.499999999999987e+02 5.256865986862222e+02 4.330514916902604e+02 -3.140382230024754e+02 + ME 7.153466587267740e+00 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.500000000000043e+02 9.702918632485769e+01 6.245997683844794e+02 -4.036836215840012e+02 + 3 7.499999999999783e+02 -9.702918632485488e+01 -6.245997683844738e+02 4.036836215839904e+02 + ME 8.222902211790691e+00 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.499999999999998e+02 -5.376163199361313e+01 7.337003421695744e+02 1.459229071312400e+02 + 3 7.500000000000000e+02 5.376163199361314e+01 -7.337003421695744e+02 -1.459229071312400e+02 + ME 6.758312705283232e+00 + From 76434697efa42bd9217167643eaa39bc5e0ffe2a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 19:28:40 +0100 Subject: [PATCH 95/96] [susy2] ** COMPLETE SUSY2 ** in CODEGEN, add reference test file for susy_gq_ttq ./CODEGEN/generateAndCompare.sh susy_gq_ttq -c 'import model MSSM_SLHA2; define q = u c d s u~ c~ d~ s~; generate g q > t t~ q' CUDACPP_RUNTEST_DUMPEVENTS=1 ./runTest.exe cp ../../test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_*txt ../../../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/ --- .../dump_CPUTest.Sigma_MSSM_SLHA2_gu_ttxu.txt | 4096 +++++++++++++++++ ...ump_CPUTest.Sigma_MSSM_SLHA2_gux_ttxux.txt | 4096 +++++++++++++++++ 2 files changed, 8192 insertions(+) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gu_ttxu.txt create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gux_ttxux.txt diff --git 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gu_ttxu.txt b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gu_ttxu.txt new file mode 100644 index 0000000000..20061a1a1a --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gu_ttxu.txt @@ -0,0 +1,4096 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.647483690509011e+02 7.527657265342381e+01 -2.528976247704283e+02 -2.163164141117315e+01 + 3 6.252973211776936e+02 -5.721080498766041e+02 -1.578766990348905e+01 2.518727230515587e+02 + 4 6.099543097714056e+02 4.968314772231802e+02 2.686852946739174e+02 -2.302410816403857e+02 + ME 3.646107800222614e-04 + +Event 1 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.542827954151951e+02 1.482213322085297e+02 -1.988618298139057e+02 -5.607271498295619e+01 + 3 6.883656117507994e+02 1.265478873489434e+02 5.602777828023584e+02 3.793700749224231e+02 + 4 5.573515928340057e+02 -2.747692195574731e+02 -3.614159529884527e+02 -3.232973599394666e+02 + ME 7.492227507053585e-04 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.301460683791101e+02 -3.656995432079240e+02 -2.257802895903974e+02 -1.768459985405174e+01 + 3 5.058528987551352e+02 2.755467101243707e+02 -2.034821274188550e+02 3.722313656043858e+02 + 4 5.640010328657552e+02 9.015283308355326e+01 4.292624170092524e+02 -3.545467657503341e+02 + ME 8.165094921041109e-04 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.758793342627306e+02 1.455349847705337e+02 4.360940220328824e+02 -4.954335945799966e+02 + 3 3.008019460079605e+02 -1.607139834787174e+02 2.732727402256846e+01 2.527964523704278e+02 + 4 5.233187197293092e+02 1.517899870818368e+01 -4.634212960554508e+02 2.426371422095687e+02 + ME 7.482573940324630e-05 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.540811678028369e+02 5.414642718170588e+01 -3.497885023717100e+02 -9.467915537920083e+00 + 3 7.415000547748699e+02 1.453779348794835e+00 7.277337852109663e+02 1.422102514562808e+02 + 4 4.044187774222939e+02 -5.560020653050046e+01 -3.779452828392566e+02 -1.327423359183604e+02 + ME 2.125858791535567e-04 + +Event 5 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.747467875786875e+02 2.462969907607520e+02 3.713870243947702e+02 1.636886763636383e+02 + 3 3.438196236093863e+02 -2.056491112573935e+02 2.636029701703988e+02 8.021128807897369e+01 + 4 6.814335888119256e+02 -4.064787950335842e+01 -6.349899945651691e+02 -2.438999644426124e+02 + ME 6.124818116703532e-04 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.623951200922343e+02 4.644673798421034e+02 3.089047820108763e+02 -7.166700647426805e+01 + 3 2.268243199894468e+02 1.761899852590787e+02 -7.114332369064562e+01 -1.238748914321566e+02 + 4 7.107805599183189e+02 -6.406573651011822e+02 -2.377614583202307e+02 1.955418979064247e+02 + ME 8.277384533280011e-04 + +Event 7 Batch 0 + 
0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.922243378496302e+02 2.878585072835455e+02 -1.441537488072182e+02 -3.723465794939189e+02 + 3 2.873990637609374e+02 -5.400981623596619e+01 -8.913204919452848e+01 -2.678369642286231e+02 + 4 7.203765983894325e+02 -2.338486910475794e+02 2.332857980017467e+02 6.401835437225419e+02 + ME 2.040186880955407e-03 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.353309706037128e+02 -7.529439061162444e+01 -4.917829145606098e+01 -3.230466069128648e+02 + 3 7.169322705461503e+02 -1.597426278178965e+02 -1.460012137440142e+01 6.987567601563110e+02 + 4 4.477367588501367e+02 2.350370184295209e+02 6.377841283046253e+01 -3.757101532434461e+02 + ME 5.255152533554952e-03 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.557626120875720e+02 2.000882245504951e+02 -5.276260741790070e+01 -1.503174088272976e+02 + 3 7.044202058180884e+02 -6.969679478438196e+02 -1.019614549623776e+02 6.882422911146141e+00 + 4 5.398171820943396e+02 4.968797232933244e+02 1.547240623802783e+02 1.434349859161516e+02 + ME 7.285226448194742e-05 + +Event 10 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.466796552973451e+02 1.172124288883391e+02 -1.804077050554744e+02 2.718475489457261e+02 + 3 5.174471655316497e+02 -1.610456139025785e+02 -4.497410659869823e+02 -1.988689340353917e+02 + 4 6.358731791710056e+02 4.383318501423927e+01 6.301487710424566e+02 
-7.297861491033446e+01 + ME 2.106140088827326e-04 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.730783827248505e+02 -3.059484875398851e+01 3.466457017175527e+02 -4.553235612803232e+02 + 3 4.410994673708889e+02 -3.026218886155177e+02 -1.990641070399050e+01 3.203005892260323e+02 + 4 4.858221499042605e+02 3.332167373695060e+02 -3.267392910135625e+02 1.350229720542916e+02 + ME 5.461308340048999e-05 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.275003875859171e+02 -1.247450244086003e+02 1.654605359856639e+02 9.390376067217456e+01 + 3 6.138170466352969e+02 3.363961838598331e+02 -2.139358085817026e+01 5.129827374509639e+02 + 4 6.586825657787861e+02 -2.116511594512328e+02 -1.440669551274935e+02 -6.068864981231385e+02 + ME 5.305276638870773e-02 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.867684047377951e+02 7.055192702127013e+01 -2.028354730671930e+02 1.900429278217245e+02 + 3 6.990707050557397e+02 -5.605742285334718e+02 2.413419117565431e+02 -3.408965629057133e+02 + 4 5.141608902064656e+02 4.900223015122018e+02 -3.850643868935018e+01 1.508536350839886e+02 + ME 7.220326046778678e-05 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.551549262960331e+02 1.090410064132905e+02 3.205839746298527e+02 1.071027348074892e+02 + 3 5.276349775014137e+02 3.895763694332612e+02 -2.529209653865598e+02 2.503196099590424e+02 + 4 
6.172100962025532e+02 -4.986173758465519e+02 -6.766300924329286e+01 -3.574223447665316e+02 + ME 7.555672402921398e-04 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.846731991828424e+02 7.106081559720656e+01 3.900476102503053e+02 4.297161529048977e+02 + 3 2.829885923647301e+02 -2.767806781033228e+02 5.223342094943638e+01 -2.732525156618248e+01 + 4 6.323382084524277e+02 2.057198625061163e+02 -4.422810311997417e+02 -4.023909013387151e+02 + ME 1.241903961679154e-03 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.471577506095513e+02 1.666056475215675e+02 -5.784682380714991e+02 -4.425627187781377e+02 + 3 6.589296733908155e+02 -1.235441202519037e+02 5.251239647671504e+02 3.783780998595694e+02 + 4 9.391257599963079e+01 -4.306152726966399e+01 5.334427330434853e+01 6.418461891856477e+01 + ME 3.159636433624256e-05 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.567490993131759e+02 3.856364495163705e+01 -1.708845728849434e+02 -3.107752047682323e+02 + 3 6.453207560475681e+02 4.468356462873770e+02 2.282834847349607e+02 4.057874246326636e+02 + 4 4.979301446392561e+02 -4.853992912390144e+02 -5.739891185001712e+01 -9.501221986443127e+01 + ME 1.432434905777595e-04 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.856701782481425e+02 2.509110753153842e+02 -3.498523763974107e+02 -2.247720379690151e+02 + 3 3.014847498930009e+02 
-1.059425909901355e+02 -2.435847754696140e+02 -1.426032222348426e+02 + 4 7.128450718588564e+02 -1.449684843252488e+02 5.934371518670247e+02 3.673752602038576e+02 + ME 1.017218336241746e-03 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.848213503304407e+02 -3.141116763848333e+02 -1.950442390378233e+02 4.531088295091878e+02 + 3 5.769300027107225e+02 5.020221748138873e+02 2.252239828724832e+02 -1.734823378963535e+02 + 4 3.382486469588368e+02 -1.879104984290540e+02 -3.017974383465995e+01 -2.796264916128346e+02 + ME 4.301842267588168e-03 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.550938429889906e+02 -4.478597170519694e+02 -1.958065402362923e+02 -2.630791652090858e+02 + 3 5.585686897587656e+02 3.351111310173187e+02 -1.360174455686904e+02 4.256744830831254e+02 + 4 3.863374672522434e+02 1.127485860346507e+02 3.318239858049827e+02 -1.625953178740396e+02 + ME 2.751146256989623e-04 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.296556563991995e+02 -3.477135312394777e+02 -1.376147989324513e+02 -5.065804111325868e+02 + 3 3.137568007204202e+02 1.080474571851863e+02 -2.382188236683312e+02 1.732653140250679e+02 + 4 5.565875428803803e+02 2.396660740542913e+02 3.758336226007825e+02 3.333150971075188e+02 + ME 5.694968712454752e-05 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.583338925767157e+02 2.471586228668331e+02 
-1.597599499756147e+02 -4.744745610949309e+02 + 3 5.378723432497914e+02 9.149532098241647e+00 4.314513680009924e+02 3.210493120152683e+02 + 4 4.037937641734918e+02 -2.563081549650743e+02 -2.716914180253777e+02 1.534252490796626e+02 + ME 4.118041916697490e-05 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.057340011976823e+02 6.848115528115159e+01 -5.207204912425279e+02 -3.017849923015606e+02 + 3 6.884459352783618e+02 -2.949639632364768e+01 6.680977958792450e+02 1.635026102131438e+02 + 4 2.058200635239559e+02 -3.898475895750392e+01 -1.473773046367171e+02 1.382823820884168e+02 + ME 3.742735875841251e-05 + +Event 24 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.702316790647312e+02 -1.210575128627593e+02 4.313728504035304e+02 -1.427598490831809e+02 + 3 7.180482366151730e+02 1.040047389253586e+02 -7.104588047260975e+02 4.956931953573400e+00 + 4 3.117200843200958e+02 1.705277393740067e+01 2.790859543225672e+02 1.378029171296075e+02 + ME 3.685039512385556e-05 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.261365010744015e+02 -5.354018140499276e+02 -2.095559720530078e+02 2.479477970595020e+02 + 3 5.483958991041942e+02 5.199465180092641e+02 -9.843995208133502e+01 -1.438862620216537e+02 + 4 3.254675998214044e+02 1.545529604066344e+01 3.079959241343431e+02 -1.040615350378483e+02 + ME 1.642955060031953e-04 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 4.635816356180675e+02 1.904702824079147e+02 -2.351549941335565e+02 -3.511853259118595e+02 + 3 3.686385821486526e+02 -2.712527815845713e+02 -6.015354190959190e+01 -2.422764621809818e+02 + 4 6.677797822332795e+02 8.078249917665664e+01 2.953085360431485e+02 5.934617880928414e+02 + ME 3.256831108762987e-04 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.851713673150520e+02 1.387976072955998e+02 1.520424011317634e+02 -1.973348453858079e+02 + 3 6.747356481771329e+02 2.426633222154767e+02 -4.300238522839811e+02 4.598501858640580e+02 + 4 5.400929845078149e+02 -3.814609295110765e+02 2.779814511522176e+02 -2.625153404782502e+02 + ME 4.337856102642487e-04 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.977804200471009e+02 -1.803202618401224e+02 -8.082809162516924e+01 -8.277519444290682e+00 + 3 7.197523834069630e+02 3.152541965091956e+02 6.467033971658864e+02 -2.080867841663850e+01 + 4 5.824671965459365e+02 -1.349339346690732e+02 -5.658753055407170e+02 2.908619786092892e+01 + ME 1.299242610389982e-04 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.123364628491765e+02 -3.746492624245139e+02 3.785128791537566e+02 -3.021950929683376e+02 + 3 4.056577755659300e+02 1.796205570313495e+00 -8.781658530568644e+01 3.960344074293251e+02 + 4 4.820057615848937e+02 3.728530568542006e+02 -2.906962938480702e+02 -9.383931446098750e+01 + ME 5.249715062449717e-04 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.349194950356051e+02 7.241679607953655e+02 1.425637322816714e+01 1.244354634469207e+02 + 3 7.321421454671269e+02 -7.253765693071589e+02 -2.895970851972086e+01 -9.498573130653320e+01 + 4 3.293835949726733e+01 1.208608511793151e+00 1.470333529155410e+01 -2.944973214038765e+01 + ME 5.155378223163163e-02 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.718338270585457e+02 -1.344914872264095e+02 -1.021614404532311e+02 3.165350011824393e+01 + 3 6.313115253715935e+02 -2.849940593920691e+02 -7.916450257599642e+01 -5.577325610990745e+02 + 4 6.968546475698608e+02 4.194855466184786e+02 1.813259430292276e+02 5.260790609808306e+02 + ME 4.593512632959087e-04 + +Event 32 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.235176898898733e+02 -4.762113006241283e+02 -2.880822916693120e+01 5.439400065022984e+02 + 3 6.603902828461303e+02 4.672103814637362e+02 1.031050210016799e+02 -4.551913221650266e+02 + 4 1.160920272639969e+02 9.000919160391994e+00 -7.429679183474865e+01 -8.874868433727180e+01 + ME 4.479107681857113e-03 + +Event 33 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.786737271642295e+02 2.009638309376701e+02 4.090184839380261e+02 1.464443769121514e+02 + 3 3.795793219608412e+02 -6.057523839522326e+00 -8.244277697544295e+01 3.704685635647953e+02 + 4 6.417469508749324e+02 -1.949063070981499e+02 -3.265757069625828e+02 -5.169129404769462e+02 + ME 1.329515937732463e-02 + +Event 34 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.621583515140109e+02 -5.051303032557108e+02 -1.429543729176959e+02 4.035605363216953e+02 + 3 3.008522892707525e+02 8.677543723835063e+01 2.726747894692539e+02 -9.290092916351111e+01 + 4 5.369893592152367e+02 4.183548660173603e+02 -1.297204165515579e+02 -3.106596071581844e+02 + ME 6.326360761237585e-04 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.158114977149371e+02 2.502256147979830e+02 4.233348779616201e+00 5.626659943296694e+02 + 3 1.476397433483021e+02 -1.670550278282843e+01 -6.055370982200890e+01 1.336101351676488e+02 + 4 7.365487589367604e+02 -2.335201120151546e+02 5.632036104239268e+01 -6.962761294973183e+02 + ME 2.119412279505752e+00 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.182456511154909e+02 -7.463771462544149e+01 -6.667773110518938e+02 2.563475070450520e+02 + 3 4.860008755751821e+02 -7.840660561780857e+01 4.141081959217036e+02 -2.419992919944375e+02 + 4 2.957534733093265e+02 1.530443202432501e+02 2.526691151301903e+02 -1.434821505061439e+01 + ME 8.968600614248642e-05 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.672182018814331e+02 -2.031706828392723e+00 -5.267408190306547e+02 2.104197478372324e+02 + 3 4.664069288608284e+02 3.712365792892206e+02 2.604523782658950e+02 -1.090109358856581e+02 + 4 4.663748692577388e+02 -3.692048724608279e+02 2.662884407647598e+02 -1.014088119515744e+02 + ME 
1.198889160249027e-04 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.068057345787184e+02 4.883513201966849e+02 -7.570036138649979e+01 -1.124032737511800e+02 + 3 3.871140338254016e+02 -1.153787089711744e+02 -3.599073977747532e+02 -8.373585688177310e+01 + 4 6.060802315958794e+02 -3.729726112255106e+02 4.356077591612531e+02 1.961391306329531e+02 + ME 9.973916186127731e-05 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.960337392567769e+02 -3.669089247616476e+02 2.651961920161228e+02 -2.027271347192069e+02 + 3 2.837821967046824e+02 -2.822567153069604e+02 -2.935613327724534e+01 -1.303560381865560e+00 + 4 7.201840640385411e+02 6.491656400686079e+02 -2.358400587388775e+02 2.040306951010725e+02 + ME 1.378170965290159e-03 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.080730228651937e+02 -3.065830270999448e+02 -2.484308296331463e+01 1.728167064871203e+01 + 3 6.842346640746096e+02 4.630487823766367e+02 8.554554725666559e+01 -4.964321303112498e+02 + 4 5.076923130601963e+02 -1.564657552766919e+02 -6.070246429335082e+01 4.791504596625379e+02 + ME 5.404279843345256e-05 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.602650851118221e+02 -1.258781096038287e+02 -9.817642232798531e+01 1.417706342452912e+01 + 3 7.146392966623014e+02 6.799675591776853e+02 -1.019163870176435e+02 1.948499239342933e+02 + 4 6.250956182258764e+02 
-5.540894495738563e+02 2.000928093456288e+02 -2.090269873588226e+02 + ME 4.843572179643119e-04 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.687893235969910e+02 1.289401357197518e+02 4.788693514682045e+01 9.783209393213438e+01 + 3 7.042017295435161e+02 -1.022058447296739e+02 -6.640064324330017e+02 -2.110675220936915e+02 + 4 6.270089468594927e+02 -2.673429099007782e+01 6.161194972861812e+02 1.132354281615571e+02 + ME 1.884939998160905e-04 + +Event 43 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.729783670130408e+02 -7.983817933050126e+01 9.052957805204312e+01 4.573169538528310e+02 + 3 5.638402597824536e+02 4.785250044669658e+02 7.435095949863266e+01 -2.887933404236804e+02 + 4 4.631813732045056e+02 -3.986868251364647e+02 -1.648805375506758e+02 -1.685236134291506e+02 + ME 6.083802212362751e-04 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.774791104122977e+02 -1.952605982635783e+01 6.371003613266311e+01 1.644949814321786e+02 + 3 7.194816205691245e+02 -3.678871192485065e+02 2.644831693887217e+01 -6.177486190667771e+02 + 4 6.030392690185776e+02 3.874131790748644e+02 -9.015835307153534e+01 4.532536376345984e+02 + ME 2.168344213081981e-04 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.477488480180841e+02 -3.787655987618923e+02 1.634662296474455e+02 6.236535517992065e+02 + 3 7.458113398274103e+02 3.819163358711198e+02 
-1.661042992235261e+02 -6.186952632673017e+02 + 4 6.439812154506047e+00 -3.150737109227506e+00 2.638069576080606e+00 -4.958288531904773e+00 + ME 9.458422935891954e-02 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.243146757688279e+02 -4.392587631431587e+00 -2.496903827548322e+02 -2.069188895501946e+02 + 3 5.341608950426614e+02 -2.704482657861201e+02 2.711825143656835e+02 -3.723515022507137e+02 + 4 6.415244291885106e+02 2.748408534175518e+02 -2.149213161085116e+01 5.792703918009084e+02 + ME 1.927561951750774e-04 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.742198761450970e+02 -3.282965096491569e+02 5.301803926793565e+02 -2.563251730900703e+02 + 3 6.484148720042494e+02 3.527030795571957e+02 -3.975273148506380e+02 3.715029176935213e+02 + 4 1.773652518506535e+02 -2.440656990803884e+01 -1.326530778287185e+02 -1.151777446034509e+02 + ME 1.126482746803783e-03 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.321401810535270e+02 -1.843482647928687e+02 4.412348098999295e+02 5.543976952635381e+02 + 3 7.293058265076229e+02 2.182722651304251e+02 -4.435200216702997e+02 -5.362221528717154e+02 + 4 3.855399243885008e+01 -3.392400033755636e+01 2.285211770370228e+00 -1.817554239182278e+01 + ME 2.318249807040869e-03 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.511117284856090e+02 -3.272266866652173e+02 5.199533974843239e+01 
1.161835877338140e+02 + 3 7.326526490901412e+02 6.615045961628415e+02 -2.993354007364775e+02 -9.792799058578565e+01 + 4 4.162356224242500e+02 -3.342779094976241e+02 2.473400609880451e+02 -1.825559714802838e+01 + ME 9.670374411388645e-05 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.322170903075261e+02 2.740692406080843e+02 1.952596610981922e+01 -6.787095515302594e+02 + 3 3.078559130669523e+02 -1.663333363406682e+02 8.625456119089937e+01 2.442716420418761e+02 + 4 4.599269966255218e+02 -1.077359042674160e+02 -1.057805273007185e+02 4.344379094883834e+02 + ME 6.662995363647553e-05 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.473696038265161e+02 -2.922314643158455e+02 -6.759614889845236e+01 -1.752060888796554e+02 + 3 5.389399151999500e+02 -2.449040872454050e+02 9.346474502284559e+01 4.708954891311221e+02 + 4 6.136904809735342e+02 5.371355515612505e+02 -2.586859612439322e+01 -2.956894002514666e+02 + ME 4.910655284179798e-04 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.818614816439092e+02 5.970116833066722e+02 3.013730734325877e+02 1.329902280423528e+02 + 3 2.108623144448949e+02 -4.198344769951677e+00 -1.698802183673394e+02 -1.248439063859964e+02 + 4 6.072762039111955e+02 -5.928133385367207e+02 -1.314928550652483e+02 -8.146321656356342e+00 + ME 1.526742583524398e-04 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
5.157714002491656e+02 -5.140718537651751e+02 -4.182413977701254e+01 1.003899065692042e+00 + 3 5.148181840855221e+02 2.868792199999327e+02 1.974924151010656e+02 3.791237552236646e+02 + 4 4.694104156653124e+02 2.271926337652422e+02 -1.556682753240530e+02 -3.801276542893567e+02 + ME 3.174536237328426e-03 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.433410767101755e+02 2.586883950027282e+02 -5.809813083922763e+02 9.710187728524585e+01 + 3 6.928799734080566e+02 -1.579832568796112e+02 6.405510983559769e+02 -2.117031848853748e+02 + 4 1.637789498817686e+02 -1.007051381231171e+02 -5.956978996370076e+01 1.146013076001289e+02 + ME 3.466281331656433e-05 + +Event 55 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.193759752058201e+02 -3.536444481659258e+02 -7.212523476050659e+01 -6.222823703878202e+02 + 3 5.307053661742267e+02 2.409461639849982e+02 1.900944302490854e+02 4.329633233142391e+02 + 4 2.499186586199529e+02 1.126982841809279e+02 -1.179691954885788e+02 1.893190470735813e+02 + ME 3.665852764854121e-05 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.858864959547015e+02 1.815174721437793e+02 3.218581876578407e+02 -1.112074732396182e+02 + 3 4.484505297447189e+02 -3.244105157450005e+02 2.934585578803474e+02 -9.873079412811626e+01 + 4 6.656629743005794e+02 1.428930436012212e+02 -6.153167455381879e+02 2.099382673677345e+02 + ME 2.342012718477031e-04 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 5.284589752749191e+02 3.868194647882292e+02 -1.709996888155516e+02 3.168575336559792e+02 + 3 6.299868555278972e+02 -1.587414880613578e+02 2.327134172236621e+02 -5.634971548731003e+02 + 4 3.415541691971833e+02 -2.280779767268714e+02 -6.171372840811039e+01 2.466396212171209e+02 + ME 2.893626236527393e-05 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.172037319760957e+02 -2.246119436411400e+02 -2.286037628748728e+01 5.744278237820342e+02 + 3 5.117934503257735e+02 1.262762853074207e+02 3.215736628881853e+02 -3.775939815489577e+02 + 4 3.710028176981306e+02 9.833565833371921e+01 -2.987132866006979e+02 -1.968338422330765e+02 + ME 6.189703659115828e-04 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.388935626701858e+02 -3.912134623809441e+02 -5.457789630286015e+02 3.082872805076099e+02 + 3 1.936051438730608e+02 1.561492575196544e+02 8.304673385628061e+01 -7.876294246644987e+01 + 4 5.675012934567535e+02 2.350642048612896e+02 4.627322291723209e+02 -2.295243380411600e+02 + ME 3.911264490501042e-04 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.258141426633659e+02 -5.584991156701968e+02 1.635894950857984e+02 4.337319270970709e+02 + 3 2.789580074371136e+02 2.331554478032953e+02 6.512410160032128e+01 -1.386180308029247e+02 + 4 4.952278498995201e+02 3.253436678669015e+02 -2.287135966861195e+02 -2.951138962941461e+02 + ME 7.105017619505166e-04 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.906141202026897e+02 4.485275282318680e+02 -2.043613424290570e+02 3.253990429020988e+02 + 3 4.163572165237975e+02 -4.021600557528675e+02 -4.112755461437413e+01 9.964509802161204e+01 + 4 4.930286632735124e+02 -4.636747247900049e+01 2.454888970434311e+02 -4.250441409237108e+02 + ME 5.849340298427127e-03 + +Event 62 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.346180891175762e+02 3.693463141798367e+02 7.549194961263062e+01 -6.305140780380819e+02 + 3 4.420621433230785e+02 -2.806743363126464e+02 3.467380983154045e+01 3.397625382625571e+02 + 4 3.233197675593453e+02 -8.867197786719018e+01 -1.101657594441711e+02 2.907515397755249e+02 + ME 2.959469621620797e-05 + +Event 63 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.451039732729312e+02 -2.415045377667665e+02 1.990362537024482e+02 -5.641092662620229e+02 + 3 3.260870385294104e+02 2.061141051805975e+02 -2.496695602716584e+02 3.892098426606745e+01 + 4 5.288089881976583e+02 3.539043258616898e+01 5.063330656921013e+01 5.251882819959554e+02 + ME 4.918924456107157e-04 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.275973380665292e+02 -6.064553482667328e+01 4.309976929667101e+02 -2.981980196075213e+02 + 3 5.799838776791828e+02 3.279821268626862e+02 -1.824214634122377e+02 4.421893627315650e+02 + 4 3.924187842542881e+02 -2.673365920360130e+02 -2.485762295544724e+02 -1.439913431240437e+02 + ME 2.191287811753559e-04 + +Event 65 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.480172869826540e+02 2.720879118036236e+02 -5.153900904044359e+02 -2.833154199679406e+02 + 3 7.075023253568394e+02 -3.440299289242928e+02 4.709796137500282e+02 4.004761563708322e+02 + 4 1.444803876605064e+02 7.194201712066916e+01 4.441047665440793e+01 -1.171607364028916e+02 + ME 4.996736404991612e-03 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.472978185025793e+02 4.857452785131265e+02 -2.223654169683453e+02 -1.189119332799752e+02 + 3 3.203062148499982e+02 1.169702135976477e+02 2.922172461416276e+02 -5.935588816501104e+01 + 4 6.323959666474223e+02 -6.027154921107742e+02 -6.985182917328221e+01 1.782678214449862e+02 + ME 1.324558609432079e-04 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.264671493042945e+02 1.195959046886509e+02 -2.647539231733029e+02 3.122121220929445e+02 + 3 5.059969655247560e+02 3.777175441887565e+02 -7.608313561896590e+00 -3.366073372596323e+02 + 4 5.675358851709478e+02 -4.973134488774076e+02 2.723622367351999e+02 2.439521516668852e+01 + ME 9.366952747759029e-05 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.996105691520875e+02 -3.814725562071958e+02 -3.417794545715574e+02 3.117664637712125e+02 + 3 2.164196744806214e+02 1.292759463548889e+02 -1.184749651041616e+02 1.268419798013014e+02 + 4 6.839697563672919e+02 2.521966098523067e+02 4.602544196757190e+02 -4.386084435725138e+02 + 
ME 2.889708313265865e-03 + +Event 69 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.950546755511077e+02 -1.873718558932053e+02 -4.578972175289679e+02 -1.735101101888632e+01 + 3 4.768584394819692e+02 -1.830244097668608e+02 2.985566003539792e+02 -3.236664843936508e+02 + 4 5.280868849669231e+02 3.703962656600662e+02 1.593406171749887e+02 3.410174954125370e+02 + ME 5.033849050995652e-05 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.918343395272258e+02 6.895733556028865e+02 -5.391072441382606e+01 -1.473005040127906e+01 + 3 2.169590284692678e+02 -1.127375202028747e+02 1.807969800614662e+02 4.091361110301506e+01 + 4 5.912066320035063e+02 -5.768358354000119e+02 -1.268862556476402e+02 -2.618356070173603e+01 + ME 1.478837776553693e-04 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.156371334918730e+02 1.547202099034306e+02 -4.807172487652236e+02 1.041836686949964e+02 + 3 3.718518305526426e+02 -8.969821893462726e+01 -7.521366892975189e+01 -3.529460545344468e+02 + 4 6.125110359554843e+02 -6.502199096880338e+01 5.559309176949757e+02 2.487623858394504e+02 + ME 1.065726814958849e-04 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.110577464974889e+02 5.009520239746098e+01 -1.453533690489527e+02 -1.445968227848547e+02 + 3 7.317124633441163e+02 -4.429659627226336e+02 5.264774879404380e+02 2.490095170354977e+02 + 4 5.572297901583944e+02 
3.928707603251725e+02 -3.811241188914850e+02 -1.044126942506430e+02 + ME 1.971612401727150e-04 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.932257450488245e+02 3.105005764664298e+01 -2.932679039283982e+02 2.601082794045340e+02 + 3 5.658879124646471e+02 3.645905401293643e+02 4.244364556305354e+02 8.459646951004228e+01 + 4 5.408863424865280e+02 -3.956405977760073e+02 -1.311685517021372e+02 -3.447047489145762e+02 + ME 9.146138593039113e-04 + +Event 74 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.374854102925440e+02 7.785209805930555e+01 4.289805712042689e+01 1.048858692406466e+02 + 3 6.381281910764945e+02 -1.004137270491618e+02 -1.591026937267357e+02 6.097630724433484e+02 + 4 7.243863986309617e+02 2.256162898985645e+01 1.162046366063089e+02 -7.146489416839951e+02 + ME 1.412913725690509e+01 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.936883054156938e+02 -3.438525101293580e+00 -2.706855443967301e+02 5.283780053968293e+02 + 3 5.912298912592890e+02 1.109657062166288e+02 4.832067437414102e+02 -3.221034603433170e+02 + 4 3.150818033250173e+02 -1.075271811153352e+02 -2.125211993446804e+02 -2.062745450535123e+02 + ME 1.405029651713649e-03 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.619486867997674e+02 2.801967015359573e+01 2.136411519593738e+02 6.258980909300585e+02 + 3 1.201252731414032e+02 2.274423842261747e+01 -8.754996679960183e+01 
7.904292618103446e+01 + 4 7.179260400588299e+02 -5.076390857621330e+01 -1.260911851597719e+02 -7.049410171110932e+02 + ME 5.927486212021591e+00 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.456676259451604e+02 -7.346624001550108e+02 6.511229493320700e+01 -1.097804865615983e+02 + 3 1.284204120828029e+02 1.251494694834492e+02 2.867183268690426e+01 2.708973588335758e+00 + 4 6.259119619720370e+02 6.095129306715618e+02 -9.378412762011116e+01 1.070715129732624e+02 + ME 1.483732393179742e-04 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.040158920877625e+02 6.911264613612160e+02 -6.659640240533207e+01 -1.163937709034253e+02 + 3 5.185438503615325e+02 -4.976050220224221e+02 -1.270913363611936e+02 7.158742227342900e+01 + 4 2.774402575507043e+02 -1.935214393387938e+02 1.936877387665258e+02 4.480634862999637e+01 + ME 4.953923467294661e-05 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.777589592768841e+02 1.742725197144059e+02 -4.776543849198210e+01 6.532264221831092e+02 + 3 5.725002211294491e+02 -1.786302554544233e+02 -1.627852110918317e+02 -5.189881598643107e+02 + 4 2.497408195936666e+02 4.357735740017461e+00 2.105506495838138e+02 -1.342382623187985e+02 + ME 9.127390566077918e-04 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.240819586861880e+02 4.679310297228965e+02 -4.118464023828053e+02 -3.002304821964348e+01 + 3 
6.688675489057649e+02 -5.494372353172420e+02 3.251429131208653e+02 1.994607943266771e+02 + 4 2.070504924080468e+02 8.150620559434545e+01 8.670348926194001e+01 -1.694377461070337e+02 + ME 3.583887166058892e-03 + +Event 81 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.198056748722776e+02 1.034797897616987e+02 -2.885605608993972e+02 4.197888462474007e+02 + 3 5.672098642055398e+02 -4.160331805498524e+02 2.087659545613753e+01 -3.849773895903518e+02 + 4 4.129844609221831e+02 3.125533907881537e+02 2.676839654432596e+02 -3.481145665704891e+01 + ME 1.012247657869651e-04 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.057598609140512e+02 6.385349666266646e+01 -2.765433460911305e+01 1.936364870179370e+02 + 3 6.235840147705877e+02 4.654039114453889e+02 -3.828889383639961e+02 -1.601633028106900e+02 + 4 6.706561243153623e+02 -5.292574081080551e+02 4.105432729731098e+02 -3.347318420724695e+01 + ME 7.262284565292408e-04 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.583322583736492e+02 1.865539504254553e+02 -1.926584839569474e+02 6.012334775737429e+02 + 3 3.620902826842561e+02 -3.107067244571256e+02 -1.177956631152976e+01 -1.855584705935048e+02 + 4 4.795774589420946e+02 1.241527740316703e+02 2.044380502684771e+02 -4.156750069802382e+02 + ME 8.397477163775882e-03 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.849329564663157e+02 
-2.622178945286149e+02 4.068620488841210e+02 -2.941124332559830e+01 + 3 4.737588937677758e+02 6.014532316188536e+01 -1.333934272225748e+02 4.505954095412365e+02 + 4 5.413081497659073e+02 2.020725713667294e+02 -2.734686216615458e+02 -4.211841662156386e+02 + ME 5.093534588509739e-03 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.085742632080858e+02 -2.174614026040271e+02 -5.283468657604089e+02 -4.190914152061854e+02 + 3 5.315764222715956e+02 8.528530557199831e+00 3.820092234108130e+02 3.695533927738616e+02 + 4 2.598493145203189e+02 2.089328720468272e+02 1.463376423495959e+02 4.953802243232386e+01 + ME 5.680595784967736e-05 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.724500140939190e+02 1.231518677708316e+02 -1.121928207497680e+01 1.201946443701656e+02 + 3 7.028475062724231e+02 -6.467096040851285e+01 -4.553168759141600e+02 -5.315061866629339e+02 + 4 6.247024796336580e+02 -5.848090736231880e+01 4.665361579891369e+02 4.113115422927684e+02 + ME 1.251324938348285e-04 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.942099203196797e+02 -7.751148196958455e+01 -1.356691819650310e+02 -1.153400900745028e+02 + 3 7.314670447251598e+02 1.724617634710876e+02 7.020747158546046e+02 1.113196793791551e+02 + 4 5.743230349551608e+02 -9.495028150150301e+01 -5.664055338895736e+02 4.020410695347638e+00 + ME 1.365800031389091e-04 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 6.382497024023744e+02 2.632142028760094e+02 -5.613974181649784e+02 1.513733956108635e+02 + 3 3.997044228265544e+02 -5.264940326118349e+01 3.435187961344461e+02 1.974500004195773e+02 + 4 4.620458747710724e+02 -2.105647996148253e+02 2.178786220305324e+02 -3.488233960304407e+02 + ME 1.847332155740200e-03 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.419006640093282e+02 -8.677155154367875e+01 6.457545216231645e+01 -9.185046144153738e+01 + 3 7.131224514048052e+02 5.460003286026869e+02 -4.154556538506973e+02 -1.944836022569670e+02 + 4 6.449768845858667e+02 -4.592287770590081e+02 3.508802016883808e+02 2.863340636985044e+02 + ME 1.266963986841731e-04 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.730615760623939e+02 -6.017783679015005e+01 -5.202921970507185e+02 -2.325386583054728e+02 + 3 5.389913703864468e+02 -6.302812531165209e+01 2.446311215742109e+02 4.761247390423042e+02 + 4 3.879470535511588e+02 1.232059621018019e+02 2.756610754765077e+02 -2.435860807368315e+02 + ME 1.075109237682244e-03 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.546745139784348e+02 -1.470341619195493e+02 -1.726383255301703e+02 -3.940886669878754e+02 + 3 5.110976540119646e+02 -2.482119727393536e+02 -1.865817698532448e+02 4.059542728975802e+02 + 4 5.342278320096004e+02 3.952461346589030e+02 3.592200953834151e+02 -1.186560590970475e+01 + ME 9.777351898357252e-05 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.683728375977241e+02 -1.148152650923627e+02 3.458291789782991e+02 5.603051703379153e+02 + 3 2.872567998557088e+02 1.635098024620329e+02 7.847331657016400e+01 -2.227620976482501e+02 + 4 5.443703625465666e+02 -4.869453736967034e+01 -4.243024955484631e+02 -3.375430726896653e+02 + ME 8.047351750864682e-04 + +Event 93 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.666948073002088e+02 5.408074886689032e+01 5.639942928586390e+02 -1.134525653745258e+01 + 3 6.168025492529713e+02 2.439040545997395e+02 -5.541969602989467e+02 1.175666879272316e+02 + 4 3.165026434468199e+02 -2.979848034666298e+02 -9.797332559692304e+00 -1.062214313897791e+02 + ME 1.679404659428770e-04 + +Event 94 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.964349376711386e+02 8.445930034540564e+01 -2.409007074648562e+02 -4.257712097695705e+02 + 3 5.660980232871289e+02 1.373833465612049e+02 5.210669225216058e+02 1.734417778711397e+02 + 4 4.374670390417325e+02 -2.218426469066105e+02 -2.801662150567495e+02 2.523294318984308e+02 + ME 3.881415827463177e-05 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.117074025057369e+02 -3.227984571262281e+02 4.276971164854591e+02 -4.684055501468923e+02 + 3 1.264078228725326e+02 8.675876182178399e+01 5.074873328843476e+01 7.665781760618941e+01 + 4 6.618847746217319e+02 2.360396953044438e+02 -4.784458497738943e+02 3.917477325407025e+02 + ME 1.949219053467063e-04 + +Event 96 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.329769441659942e+02 -9.642859092211873e+01 6.903981466332599e+02 -2.265107649915409e+02 + 3 3.937873938465681e+02 -4.837693103302090e+01 -3.847118583018797e+02 6.873841850241250e+01 + 4 3.732356619874388e+02 1.448055219551397e+02 -3.056862883313803e+02 1.577723464891279e+02 + ME 2.935171699513264e-05 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.394989963266854e+01 6.003767577498499e+00 -2.078495220615400e+01 2.616364312804199e+01 + 3 7.377311980366452e+02 -5.308290258162607e+02 4.681853362634530e+02 2.080152802450354e+02 + 4 7.283189023306865e+02 5.248252582387622e+02 -4.474003840572990e+02 -2.341789233730774e+02 + ME 2.227857560908627e-02 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.496912687496081e+02 -2.485814905959505e+02 -5.435228288348339e-01 -2.350907922099247e+01 + 3 7.458289852530974e+02 7.373315781279123e+02 9.801365830907574e+01 -5.473885205171281e+01 + 4 5.044797459972944e+02 -4.887500875319617e+02 -9.747013548024090e+01 7.824793127270527e+01 + ME 7.753687953350025e-05 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.698125854886769e+02 8.336002034290718e+01 8.774494220182724e+01 -1.191144253093525e+02 + 3 6.496622934125945e+02 5.714329899004553e+02 -6.230613627727956e+01 3.027265745152471e+02 + 4 6.805251210987283e+02 -6.547930102433625e+02 -2.543880592454770e+01 
-1.836121492058946e+02 + ME 6.436625879222816e-04 + +Event 100 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.141460480129781e+02 -5.842473718080512e+02 -5.092222124447422e+01 1.823110095657221e+02 + 3 3.909476383151781e+02 2.539115798088024e+02 -2.930333502072385e+02 -5.000421191795164e+01 + 4 4.949063136718438e+02 3.303357919992488e+02 3.439555714517127e+02 -1.323067976477706e+02 + ME 1.509230189250195e-04 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.469346538870476e+02 3.524232024688499e+02 -1.488240016505349e+02 -6.415299525912138e+02 + 3 6.502268999047171e+02 -2.777200960400716e+02 1.351761574712158e+02 5.721835160737410e+02 + 4 1.028384462082358e+02 -7.470310642877821e+01 1.364784417931911e+01 6.934643651747267e+01 + ME 6.428576508840639e-05 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.426790432885583e+02 -3.141071077544728e+02 6.615000409077074e+02 1.238005738162371e+02 + 3 6.735764515788642e+01 -4.139700837311953e+00 -5.533298776898177e+01 -3.818606686673834e+01 + 4 6.899633115535552e+02 3.182468085917849e+02 -6.061670531387255e+02 -8.561450694949879e+01 + ME 5.946355391052463e-04 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.837874798175255e+02 -2.731724972668680e+02 1.247027290420595e+02 -3.793103501549070e+02 + 3 4.466406321977811e+02 -2.904538080082218e+02 -1.536665846758872e+02 3.025078850172423e+02 + 4 
5.695718879846933e+02 5.636263052750896e+02 2.896385563382781e+01 7.680246513766477e+01 + ME 8.764589092091150e-05 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.788466572679498e+02 3.572346730226224e+02 -3.682137844992379e+02 2.680773207965347e+02 + 3 2.925711988065158e+02 2.155069407513812e+02 1.697995838195863e+02 -1.016010147279926e+02 + 4 6.285821439255348e+02 -5.727416137740034e+02 1.984142006796517e+02 -1.664763060685422e+02 + ME 2.758719024653138e-04 + +Event 105 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.361125455083113e+02 2.619004058447622e+02 4.338373361330957e+01 -2.061496357605195e+02 + 3 5.299016201311088e+02 2.892532450564946e+02 2.091058919093095e+02 3.916669672191839e+02 + 4 6.339858343605798e+02 -5.511536509012566e+02 -2.524896255226191e+02 -1.855173314586643e+02 + ME 3.039285293896498e-04 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.578050478863485e+02 -2.265838270225943e+02 2.740910124726658e+02 -3.947579646386072e+01 + 3 5.202885196186892e+02 1.412729374205232e+02 1.631578432376887e+02 4.734148487210871e+02 + 4 6.219064324949621e+02 8.531088960207101e+01 -4.372488557103545e+02 -4.339390522572265e+02 + ME 1.929017964992398e-03 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.409822745993890e+02 9.278463733038998e+01 5.102180459532772e+02 -1.540466750365500e+02 + 3 2.501852297905710e+02 
1.682301834486208e+02 1.474652503315490e+02 1.120056004263085e+02 + 4 7.088324956100399e+02 -2.610148207790107e+02 -6.576832962848262e+02 4.204107461024152e+01 + ME 7.133155139467854e-04 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.835202199428555e+02 6.670011709444186e+02 6.653656309718585e+01 1.337243986739828e+02 + 3 2.377887385005082e+02 -1.098327419601477e+02 7.667443498831059e+01 -1.964720946353502e+02 + 4 5.786910415566365e+02 -5.571684289842709e+02 -1.432109980854965e+02 6.274769596136723e+01 + ME 1.049589479206707e-04 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.978180281189351e+02 4.291222314737004e+02 2.249703559956600e+02 3.501840146583367e+02 + 3 3.585061336071062e+02 -3.227227650115257e+02 1.541688059097761e+02 2.467071262824851e+01 + 4 5.436758382739590e+02 -1.063994664621748e+02 -3.791391619054360e+02 -3.748547272865852e+02 + ME 1.152230206382152e-03 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.073952645543156e+01 -4.753982451958468e+01 4.872856968801237e+01 -1.922426029646691e+01 + 3 7.438039776014969e+02 1.707202332282495e+02 -7.225114374584515e+02 4.556513803361385e+01 + 4 6.854564959430718e+02 -1.231804087086648e+02 6.737828677704391e+02 -2.634087773714689e+01 + ME 5.628571157985040e-04 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.206822291802365e+02 -3.873336848644894e+02 
2.415505427333673e+02 -2.504714268307115e+02 + 3 5.478000561519709e+02 4.687653961676167e+02 -2.245690260344170e+02 -1.729527606656598e+02 + 4 4.315177146677930e+02 -8.143171130312749e+01 -1.698151669895030e+01 4.234241874963712e+02 + ME 1.131019547165861e-04 + +Event 112 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.610471238372960e+02 2.563298943277285e+02 9.635756626046441e+01 -2.352981732387216e+02 + 3 6.139063356201011e+02 1.031778254919422e+02 -4.257030126280928e+02 4.301305270271112e+02 + 4 5.250465405426033e+02 -3.595077198196709e+02 3.293454463676283e+02 -1.948323537883896e+02 + ME 2.442343844239902e-04 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.886653054136124e+02 3.035646198144377e+02 3.278619896967805e+02 -3.832517176826292e+02 + 3 5.420023902452333e+02 -3.658357535838290e+02 -3.990519958595696e+02 2.623541560166928e+01 + 4 3.693323043411537e+02 6.227113376939163e+01 7.119000616278893e+01 3.570163020809600e+02 + ME 7.975596703757943e-05 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.165204340356855e+02 2.346362244736888e+01 6.298471388966840e+00 5.159487827839334e+02 + 3 5.932916594323345e+02 3.608814360715945e+02 -5.336137507463695e+01 -4.678804824963537e+02 + 4 3.901879065319799e+02 -3.843450585189634e+02 4.706290368567026e+01 -4.806830028757967e+01 + ME 5.324614747564868e-04 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.432307281524775e+02 2.250327918244370e+02 4.870559856477669e+02 -8.506664127290338e+01 + 3 4.265243530840494e+02 2.057819224248363e+02 -2.472237669715339e+02 2.801021835354204e+02 + 4 5.302449187634724e+02 -4.308147142492733e+02 -2.398322186762331e+02 -1.950355422625171e+02 + ME 2.335655038271802e-04 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.402635748890413e+02 -4.240500842615081e+02 -5.733358735035191e+01 -1.035683405941509e+02 + 3 4.399967684638557e+02 1.183617589007454e+02 -1.041572505293867e+02 -4.107784286579766e+02 + 4 6.197396566471035e+02 3.056883253607625e+02 1.614908378797389e+02 5.143467692521278e+02 + ME 1.289945162035232e-04 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.074085311587982e+02 -4.270248480828711e+01 -3.034838508096459e+02 2.395944736750828e+01 + 3 5.360984061023379e+02 3.510554986169303e+02 -1.596589010508530e+02 -3.723849798683070e+02 + 4 6.564930627388640e+02 -3.083530138086433e+02 4.631427518604987e+02 3.484255325007987e+02 + ME 1.780648198094132e-04 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.403602961735895e+02 4.471526113902057e+02 -1.804334130868148e+02 -2.439007487679596e+02 + 3 5.654623567965704e+02 -5.534570111367971e+02 -1.157195831079004e+02 6.480112868522362e+00 + 4 3.941773470298400e+02 1.063043997465926e+02 2.961529961947154e+02 2.374206358994369e+02 + ME 3.186787057162162e-05 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 8.009099446659006e+01 5.775399043490317e+01 -2.629604726664822e+01 4.886268393818208e+01 + 3 7.131140611332346e+02 2.472685400460708e+02 -2.870014097539109e+02 -6.041689532644715e+02 + 4 7.067949444001754e+02 -3.050225304809738e+02 3.132974570205592e+02 5.553062693262893e+02 + ME 7.594993920622219e-04 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.007248873753321e+02 2.708997263130530e+02 -3.880896283797751e+02 1.634784128397387e+02 + 3 7.413897277398675e+02 -4.257033276374028e+02 5.921425482134987e+02 -1.334264135464211e+02 + 4 2.578853848848011e+02 1.548036013243502e+02 -2.040529198337238e+02 -3.005199929331748e+01 + ME 1.118063892588403e-04 + +Event 121 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.732265116821120e+02 -1.149395375629033e+02 4.260916136383034e+02 3.658189076403450e+02 + 3 4.323948798659246e+02 -2.148488009071912e+01 -4.178027098651984e+02 1.092914804138530e+02 + 4 4.943786084519636e+02 1.364244176536225e+02 -8.288903773105277e+00 -4.751103880541979e+02 + ME 8.098779643821331e-02 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.423360304412702e+02 2.648046119434483e+02 2.369247279710451e+01 -2.156644197927059e+02 + 3 6.059487982275790e+02 2.457729689670163e+01 -4.569077875801422e+02 3.972469964635579e+02 + 4 5.517151713311509e+02 -2.893819088401499e+02 4.332153147830377e+02 -1.815825766708520e+02 + ME 2.289560436833749e-04 + +Event 123 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.430133297276667e+02 -4.205671322284506e+01 3.498095937953862e+01 1.321377229770997e+02 + 3 7.140350670908592e+02 -2.955397919833875e+01 -6.570980288365158e+02 -2.778395577453973e+02 + 4 6.429516031814726e+02 7.161069242118353e+01 6.221170694569765e+02 1.457018347682965e+02 + ME 6.121407388405227e-04 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.053457283343441e+02 5.458657819531910e+02 -1.853964251366731e+01 -2.610177782464908e+02 + 3 7.499633671623128e+02 -6.784114238502394e+02 2.145325921506609e+01 3.189713933003629e+02 + 4 1.446909045033435e+02 1.325456418970486e+02 -2.913616701398675e+00 -5.795361505387171e+01 + ME 4.325459019938280e-04 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.695439244882122e+02 9.058534244088483e+01 6.586171675820721e+02 7.941529525294386e+01 + 3 9.341516463500352e+01 3.490868167113007e+01 5.232133368429144e+01 6.906703243419068e+01 + 4 7.370409108767839e+02 -1.254940241120153e+02 -7.109385012663632e+02 -1.484823276871339e+02 + ME 1.072533298958066e-02 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.465564354211969e+02 -2.094351601488127e+02 -1.930091683601272e+02 -5.804477571728034e+02 + 3 1.356182567235448e+02 -2.832094442380729e+01 9.735247446175228e+01 -9.007070211700794e+01 + 4 7.178253078552585e+02 2.377561045726200e+02 9.565669389837490e+01 6.705184592898115e+02 + ME 
1.775792866913826e-03 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.508388003927652e+02 -3.846405138087859e+02 7.756355374444067e+01 2.220162025777267e+02 + 3 6.162879941073577e+02 2.174727303224461e+02 1.334711143222092e+02 -5.609830344035004e+02 + 4 4.328732054998775e+02 1.671677834863399e+02 -2.110346680666500e+02 3.389668318257735e+02 + ME 3.456676365045885e-05 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.468963146802860e+02 5.701805835528933e+02 -3.440982003215340e+02 -3.381488363986430e+02 + 3 1.196664332518720e+02 -9.337643239636880e+01 2.398139841985227e+01 7.089280393650263e+01 + 4 6.334372520678422e+02 -4.768041511565245e+02 3.201168019016818e+02 2.672560324621405e+02 + ME 1.802061068491473e-04 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.378966182438206e+02 -4.256397208622688e+02 4.624364030548156e+01 9.190104474357972e+01 + 3 7.127537996732576e+02 5.790589826349545e+02 -1.369827771626341e+02 -3.923574802896586e+02 + 4 3.493495820829216e+02 -1.534192617726859e+02 9.073913685715252e+01 3.004564355460789e+02 + ME 2.389894965110232e-05 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.322026526626456e+02 5.905875735566585e+02 -2.387291116192750e+01 -2.243136110600485e+02 + 3 5.268087771404593e+02 -3.287250458747471e+02 1.913681034684307e+02 3.644798771698753e+02 + 4 3.409885701968954e+02 
-2.618625276819114e+02 -1.674951923065031e+02 -1.401662661098268e+02 + ME 2.700010384626911e-04 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.691964685177018e+02 -2.641651354044939e+02 4.065264362900751e+01 -3.210735842607329e+01 + 3 5.382709487855665e+02 -3.022535437819009e+02 -4.307865739991412e+02 1.131429946566680e+02 + 4 6.925325826967321e+02 5.664186791863948e+02 3.901339303701334e+02 -8.103563623059483e+01 + ME 5.499127874095715e-04 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.376388194981168e+02 -2.491804956023666e+01 3.114513197621116e+01 1.317327453336230e+02 + 3 7.332494677489979e+02 -3.054807357444666e+02 -6.882601889638179e+00 -6.665500220046780e+02 + 4 6.291117127528853e+02 3.303987853047033e+02 -2.426253008657300e+01 5.348172766710550e+02 + ME 3.773516208278825e-04 + +Event 133 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.818916885738672e+02 -3.437736592641007e+02 -2.113522447259726e+02 -4.192228966514222e+02 + 3 7.075583625851592e+02 3.695171106849944e+02 9.875952986414086e+01 5.952667441040354e+02 + 4 2.105499488409736e+02 -2.574345142089369e+01 1.125927148618317e+02 -1.760438474526132e+02 + ME 6.646426354302590e-03 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.039051474789597e+02 -1.767404282002262e+02 5.832845063404939e+02 3.521710697233706e+02 + 3 6.740856043500102e+02 9.540039380435485e+01 
-5.203258634262523e+02 -4.177932056695244e+02 + 4 1.220092481710302e+02 8.134003439587137e+01 -6.295864291424152e+01 6.562213594615409e+01 + ME 5.063962311856080e-05 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.491379873081084e+02 -6.603965492909806e+02 -9.243924572685609e+01 -3.413782470545816e+02 + 3 4.360367703469753e+02 3.763875731093294e+02 3.833030381995055e+01 2.167746473012021e+02 + 4 3.148252423449159e+02 2.840089761816512e+02 5.410894190690560e+01 1.246035997533796e+02 + ME 2.984535980913852e-05 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.907976432034610e+02 -8.965778913807026e+01 -5.375684903631193e+02 -4.244796613161183e+02 + 3 4.317447428217262e+02 2.541758793770707e+02 2.501815833403359e+02 2.433255445990286e+02 + 4 3.774576139748128e+02 -1.645180902390004e+02 2.873869070227833e+02 1.811541167170898e+02 + ME 2.997979671416653e-05 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.927917878715716e+02 -5.453882061843875e+02 -2.239274061847311e+02 6.172783069514816e+01 + 3 3.718333194205910e+02 2.859809174201714e+02 -2.363544177495510e+02 2.472896101988848e+01 + 4 5.353748927078368e+02 2.594072887642159e+02 4.602818239342820e+02 -8.645679171503689e+01 + ME 1.234034799175695e-04 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.164849493482387e+02 2.012854405109472e+01 -2.573298799707043e+01 
-1.118096528381494e+02 + 3 7.481698498358139e+02 -1.044692284663333e+02 -4.003634472873117e+00 7.408294509656059e+02 + 4 6.353452008159477e+02 8.434068441523856e+01 2.973662246994371e+01 -6.290197981274564e+02 + ME 3.559608571178405e+00 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.415587822283577e+02 -2.468214832259765e+02 1.926082427237748e+02 1.365416492148350e+02 + 3 5.828887331044928e+02 -1.023403009989268e+02 -5.561813319045077e+02 1.412376154306548e+02 + 4 5.755524846671491e+02 3.491617842249035e+02 3.635730891807333e+02 -2.777792646454897e+02 + ME 4.257981101039480e-04 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.395392082109441e+02 -3.037880820376849e+02 -2.455930383243060e+02 -2.014735126343028e+02 + 3 4.709796125547877e+02 -2.826270024952004e+02 2.984919122515593e+02 2.298833426397907e+02 + 4 5.894811792342678e+02 5.864150845328854e+02 -5.289887392725339e+01 -2.840983000548779e+01 + ME 1.257615107455672e-04 + +Event 141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.025838986653695e+02 -2.680006525137058e+02 -6.218827689980461e+01 -1.259574698062632e+02 + 3 5.104624598690774e+02 -2.829910827131053e+02 4.173533268753468e+02 -7.939880721102661e+01 + 4 6.869536414655532e+02 5.509917352268112e+02 -3.551650499755422e+02 2.053562770172897e+02 + ME 3.793210112402596e-04 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
4.390011511178413e+02 -3.153925512561954e+02 3.992377088505193e+01 -3.027468279160259e+02 + 3 4.597282536099518e+02 2.984856708041211e+02 -2.221794712617382e+02 -2.699863960308454e+02 + 4 6.012705952722067e+02 1.690688045207420e+01 1.822557003766862e+02 5.727332239468712e+02 + ME 1.665826835277582e-04 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.103308443495007e+02 -3.626595603160224e+02 2.462759922459803e+02 5.589240443825270e+02 + 3 3.424564807343298e+02 4.507572778536915e+01 -2.357842367637252e+02 -2.442343416788665e+02 + 4 4.472126749161696e+02 3.175838325306533e+02 -1.049175548225538e+01 -3.146897027036604e+02 + ME 1.284828007996448e-03 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.893886390440550e+02 -2.470805413393664e+02 1.331686162420118e+02 6.296618309717111e+02 + 3 7.132719020730981e+02 2.482972988978648e+02 -2.304803220538649e+02 -6.276815106349291e+02 + 4 9.733945888284487e+01 -1.216757558499173e+00 9.731170581185297e+01 -1.980320336781243e+00 + ME 3.990515310822803e-04 + +Event 145 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.784954309743687e+02 2.391836032855265e+02 1.115572896135231e+01 -2.931305935912624e+02 + 3 7.389406222827197e+02 -4.231861417520661e+02 1.513250860114714e+02 5.865555822189356e+02 + 4 3.825639467429115e+02 1.840025384665395e+02 -1.624808149728235e+02 -2.934249886276729e+02 + ME 2.226506122500389e-03 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.681255842987410e+02 -3.253195724522379e+01 1.754808059398437e+02 -4.327698247100132e+02 + 3 2.875849079819392e+02 2.091841587061404e+01 1.879781824316579e+02 -2.166372592748876e+02 + 4 7.442895077193195e+02 1.161354137460974e+01 -3.634589883715017e+02 6.494070839849009e+02 + ME 5.368006648148185e-02 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.442136391928777e+02 -1.784444843977844e+02 -1.666832492802189e+02 -3.816014311599316e+00 + 3 5.551361515401285e+02 1.378338123621512e+02 -5.199472642306259e+02 1.372327560591401e+02 + 4 7.006502092669938e+02 4.061067203563306e+01 6.866305135108448e+02 -1.334167417475408e+02 + ME 7.671437073741464e-04 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.547263863263726e+02 3.928375677411887e+02 5.145105706241225e+01 2.231759855356057e+02 + 3 7.397285466814292e+02 -5.611511356388266e+02 -1.533645573573770e+02 -4.569322031694095e+02 + 4 3.055450669921979e+02 1.683135678976379e+02 1.019135002949646e+02 2.337562176338038e+02 + ME 2.272731853639484e-05 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.343018799311635e+02 9.853424545130945e+01 1.924850318874441e+02 -9.021023174733594e+01 + 3 7.291173748950660e+02 3.429747374294526e+01 -5.990516617369192e+02 4.142136359886766e+02 + 4 5.365807451737705e+02 -1.328317191942546e+02 4.065666298494750e+02 -3.240034042413406e+02 + ME 8.684588967972684e-04 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.707648023587807e+02 -8.969278865174961e+01 -3.008719699078221e+02 3.507859183712496e+02 + 3 6.876639918976695e+02 3.906111988928598e+02 4.609284537794546e+02 -3.284046551871671e+02 + 4 3.415712057435500e+02 -3.009184102411105e+02 -1.600564838716326e+02 -2.238126318408256e+01 + ME 1.157923151125950e-04 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.503034458278050e+02 -1.575298496674959e+02 -3.658248853789648e+01 -6.298735108350153e+02 + 3 6.998690336552311e+02 1.302751858829804e+02 -1.019415103826456e+02 6.800389464387811e+02 + 4 1.498275205169628e+02 2.725466378451583e+01 1.385239989205421e+02 -5.016543560376589e+01 + ME 6.795483587696179e-04 + +Event 152 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.401192382353394e+02 1.493701961830190e+02 6.288419447382045e+02 3.605867993093738e+02 + 3 7.332111095478889e+02 -1.230079111936445e+02 -6.287602831147090e+02 -3.565502647954900e+02 + 4 2.666965221677111e+01 -2.636228498937447e+01 -8.166162349551351e-02 -4.036534513883709e+00 + ME 8.308702592914706e-04 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.645797071775899e+02 7.941901905692946e+01 3.691428696980725e+02 -4.197337333594241e+02 + 3 6.079979027943974e+02 1.021455738177839e+02 -5.566920170809548e+02 2.220849604771994e+02 + 4 3.274223900280123e+02 -1.815645928747133e+02 1.875491473828823e+02 1.976487728822249e+02 + ME 3.317209995588655e-05 + +Event 154 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.022174885419887e+02 -5.152457849782368e+02 -1.493252664732706e+02 -2.736597328082223e+02 + 3 3.617627670199851e+02 1.925398333816265e+02 -2.626238171638091e+02 1.575736108034646e+02 + 4 5.360197444380262e+02 3.227059515966102e+02 4.119490836370798e+02 1.160861220047577e+02 + ME 6.319636534361052e-05 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.202229507100909e+02 -2.107861924791834e+02 -3.212541876154507e+02 4.868690137883070e+02 + 3 2.943040328093192e+02 2.940980302320594e+02 1.073731199058898e+01 2.433613089266564e+00 + 4 5.854730164805901e+02 -8.331183775287643e+01 3.105168756248618e+02 -4.893026268775735e+02 + ME 5.899295977736313e-03 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.945486805149832e+02 4.540818864859257e+02 -1.431706201593250e+02 -1.337542944644701e+02 + 3 5.997303202813281e+02 -3.624214233270367e+02 -5.726286247273347e+01 4.743923835389624e+02 + 4 4.057209992036886e+02 -9.166046315888885e+01 2.004334826320584e+02 -3.406380890744924e+02 + ME 4.697248099742586e-03 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.617003083190188e+02 3.118400043328062e+02 3.404502064148865e+02 -4.079626411035580e+00 + 3 5.720097526413111e+02 -4.999240316044800e+01 -4.329264075474301e+02 -3.705005295422581e+02 + 4 4.662899390396694e+02 -2.618476011723578e+02 9.247620113254364e+01 
3.745801559532937e+02 + ME 3.829047994896814e-05 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.784877363061536e+02 -5.707102180762959e+02 -3.102223423027389e+02 -1.959529373021938e+02 + 3 5.650909444059712e+02 5.525284805868615e+02 7.765167789879931e+01 8.950011457818250e+01 + 4 2.564213192878751e+02 1.818173748943441e+01 2.325706644039396e+02 1.064528227240113e+02 + ME 3.301654157248055e-05 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.369491563274252e+02 2.154713482252002e+02 -2.912667909729743e+02 3.962955349875316e+02 + 3 6.066564496499102e+02 -4.020061311781470e+01 5.572389608252350e+02 -2.364332868806716e+02 + 4 3.563943940226648e+02 -1.752707351073854e+02 -2.659721698522608e+02 -1.598622481068599e+02 + ME 3.275663125831437e-04 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.492474755438521e+02 3.490068395973683e+02 1.460348644657109e+02 -5.276270735801971e+02 + 3 2.857818814470014e+02 -2.550253586192556e+02 1.227259509083861e+02 3.964456076362121e+01 + 4 5.649706430091474e+02 -9.398148097811274e+01 -2.687608153740975e+02 4.879825128165766e+02 + ME 6.601297418086642e-05 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.770282049439583e+02 -2.863253153105185e+02 -4.911270786072976e+02 -3.676672364525181e+02 + 3 1.598243093356544e+02 -7.505362471426162e+01 1.299195075310523e+02 -5.506073768810753e+01 + 4 
6.631474857203876e+02 3.613789400247800e+02 3.612075710762454e+02 4.227279741406256e+02 + ME 1.483508416229404e-04 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.178592782584630e+02 -3.271131571456630e+02 3.943743741889438e+02 -7.512700901574513e+01 + 3 3.730686930366257e+02 -2.885924195736569e+01 -1.360208443078026e+02 -3.461874113706257e+02 + 4 6.090720287049107e+02 3.559723991030290e+02 -2.583535298811413e+02 4.213144203863708e+02 + ME 9.800737736465887e-05 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.388642316037673e+02 3.152159924116781e+02 3.539969933522671e+01 -4.356149670486711e+02 + 3 5.364171791816749e+02 -5.299694218906361e+02 3.369785517714305e+01 7.576448071880543e+01 + 4 4.247185892145582e+02 2.147534294789580e+02 -6.909755451236975e+01 3.598504863298658e+02 + ME 4.058053122847328e-05 + +Event 164 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.862697092177667e+02 4.132218376422068e+02 1.310202162324327e+02 -5.320221138485150e+02 + 3 4.476895523579006e+02 -2.769046850483522e+02 1.374187337517142e+02 3.238299280529300e+02 + 4 3.660407384243330e+02 -1.363171525938544e+02 -2.684389499841469e+02 2.081921857955847e+02 + ME 3.221465661535458e-05 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.382444910715276e+02 -2.158277263671036e+02 -9.471372817531808e+00 -1.004446273032522e+02 + 3 7.304591383576045e+02 
4.619003715882295e+02 -1.223345688256176e+02 5.524969256086772e+02 + 4 5.312963705708671e+02 -2.460726452211260e+02 1.318059416431495e+02 -4.520522983054249e+02 + ME 7.043062289382798e-03 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.131352071380649e+02 -7.633553084455029e+01 -1.899581415396244e+02 5.929087379418958e+01 + 3 7.305557876753161e+02 8.980971292745940e+01 7.136333043711877e+02 1.279589045828712e+02 + 4 5.563090051866194e+02 -1.347418208290915e+01 -5.236751628315633e+02 -1.872497783770607e+02 + ME 3.548453752490718e-04 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.122964103002419e+02 -3.405127102276982e+02 6.366431608201745e+01 2.235761145061386e+02 + 3 4.697083356610920e+02 -2.521100678451879e+02 -2.856113063438231e+01 -3.952855880214881e+02 + 4 6.179952540386658e+02 5.926227780728861e+02 -3.510318544763516e+01 1.717094735153495e+02 + ME 1.082644208257133e-04 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.156643283953479e+02 -3.999734570317169e+02 4.816586825103863e+02 3.467009924560656e+02 + 3 6.192344221355603e+02 2.722545660880235e+02 -4.999454120042315e+02 -2.436869012025524e+02 + 4 1.651012494690918e+02 1.277188909436935e+02 1.828672949384506e+01 -1.030140912535133e+02 + ME 1.010769340912540e-03 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.626022684949455e+02 7.511110909567984e+01 
-2.030941161665286e+02 -2.908461902563517e+02 + 3 5.580565590514409e+02 -2.529981754432838e+02 -3.439969378312538e+02 3.592842232626200e+02 + 4 5.793411724536144e+02 1.778870663476035e+02 5.470910539977823e+02 -6.843803300626824e+01 + ME 1.474225101402201e-04 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.602909342483499e+02 4.699653539595538e+02 -3.020118498241595e+02 3.520021683086901e+02 + 3 1.039297502933439e+02 3.247420585022844e+01 -9.851348423194941e+01 6.473976746580496e+00 + 4 7.357793154583059e+02 -5.024395598097821e+02 4.005253340561090e+02 -3.584761450552707e+02 + ME 1.667220498064669e-02 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.506693011949598e+02 -3.657300520509279e+01 -1.244227366169959e+02 -7.669834565089050e+01 + 3 6.344013325830558e+02 -2.026333084464632e+02 -4.956100871165361e+02 3.402578943089166e+02 + 4 7.149293662219821e+02 2.392063136515565e+02 6.200328237335322e+02 -2.635595486580258e+02 + ME 2.217870824657313e-03 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.352445157558219e+02 -2.018352690102653e+02 3.892440882325301e+02 -3.069825004886508e+02 + 3 6.716112180685399e+02 2.825227203806541e+02 -5.978593235713690e+02 1.175022124175020e+02 + 4 2.931442661756387e+02 -8.068745137038918e+01 2.086152353388394e+02 1.894802880711482e+02 + ME 3.229846906505831e-05 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 6.571348515648596e+02 -2.769863586381786e+02 5.805753619381593e+02 1.343019708712702e+02 + 3 5.332990408103323e+02 1.871824832342877e+02 -4.782426732337678e+02 1.437168410371091e+02 + 4 3.095661076248081e+02 8.980387540389081e+01 -1.023326887043916e+02 -2.780188119083794e+02 + ME 9.982771396756747e-03 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.091496911716728e+02 -4.752584064243671e+02 3.135726231883978e+01 -3.797492797588730e+02 + 3 6.417481529658016e+02 3.309293137608123e+02 9.015643604119191e+01 5.424004960996682e+02 + 4 2.491021558625255e+02 1.443290926635548e+02 -1.215136983600317e+02 -1.626512163407953e+02 + ME 1.304483496737769e-03 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.399801778396888e+02 1.966672297646827e+02 2.343185748302534e+02 -4.449667388535756e+02 + 3 6.987953575798325e+02 -1.857207036318897e+02 -9.664246188148672e+01 6.666955876403316e+02 + 4 2.612244645804785e+02 -1.094652613279310e+01 -1.376761129487668e+02 -2.217288487867561e+02 + ME 9.483839580625748e-03 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.615757321243974e+02 -4.129469954321283e+02 4.686878756164517e+02 -2.179194886871010e+02 + 3 1.607981401590111e+02 -6.355407199259609e+01 7.929314438200188e+00 1.474925346731048e+02 + 4 6.776261277165925e+02 4.765010674247243e+02 -4.766171900546520e+02 7.042695401399618e+01 + ME 6.547021449193765e-04 + +Event 177 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.314334067424885e+02 -3.493619040652742e+02 -2.026482683689240e+01 -2.523299055494342e+02 + 3 4.840006500668402e+02 -1.846595828310068e+02 -1.450727057198389e+02 4.232155216776995e+02 + 4 5.845659431906719e+02 5.340214868962810e+02 1.653375325567312e+02 -1.708856161282654e+02 + ME 2.279478197300691e-04 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.528135981327367e+02 -2.544528544607913e+02 1.436928116455423e+02 3.458992272209776e+02 + 3 3.053350882587862e+02 -1.380299578048219e+02 2.072032295570572e+02 1.767599177741536e+02 + 4 7.418513136084765e+02 3.924828122656130e+02 -3.508960412025995e+02 -5.226591449951311e+02 + ME 7.508613036866492e-02 + +Event 179 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.433145319259943e+02 -2.538538580850882e+02 -6.778753511348521e+02 -1.689962142519080e+02 + 3 1.647945947160298e+02 1.009041857568576e+02 1.171651165877689e+02 5.699069397138987e+01 + 4 5.918908733579761e+02 1.529496723282306e+02 5.607102345470832e+02 1.120055202805181e+02 + ME 1.172516302841432e-04 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.396120216689869e+02 1.204528233788652e+02 -1.081248155319048e+02 1.766750195544081e+02 + 3 5.541470271917009e+02 2.767127195685323e+02 2.999096875483203e+02 3.749175614572561e+02 + 4 7.062409511393136e+02 -3.971655429473977e+02 -1.917848720164147e+02 -5.515925810116631e+02 + ME 1.350441514636773e-02 + +Event 181 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.165494222755782e+02 1.336973493521793e+02 -1.495065670853883e+02 -8.164837697364385e+01 + 3 6.960869932595207e+02 -2.848973600545249e+02 2.209041937252092e+01 6.347303441548928e+02 + 4 5.873635844649011e+02 1.512000107023455e+02 1.274161477128675e+02 -5.530819671812490e+02 + ME 6.197861987937880e-02 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.472681881349898e+02 4.279258056181361e+02 3.994050733201775e+02 -2.762448183472868e+02 + 3 5.337197582091034e+02 -3.479343829022644e+02 -4.034091782989213e+02 -3.254965992745415e+01 + 4 3.190120536559072e+02 -7.999142271587159e+01 4.004104978744047e+00 3.087944782747408e+02 + ME 5.663270932434246e-05 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.165307808531155e+02 -3.276949594572818e+02 8.808524820164888e+01 -5.147496540405800e+02 + 3 2.975460412740736e+02 -1.030095950018340e+02 -2.375020297789283e+02 1.466814775843214e+02 + 4 5.859231778728107e+02 4.307045544591156e+02 1.494167815772794e+02 3.680681764562588e+02 + ME 7.184865322254869e-05 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.645337360463251e+02 -3.940276919793660e+02 3.776398996283964e+02 1.443212503288767e+02 + 3 5.368100353438222e+02 2.392766596964612e+02 -1.719264331693737e+02 -4.487237410122138e+02 + 4 3.986562286098530e+02 1.547510322829050e+02 -2.057134664590229e+02 3.044024906833371e+02 + ME 
3.009111742803267e-05 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.347397779710934e+02 2.522092504724421e+02 -1.599825720327363e+02 5.600809373302327e+02 + 3 4.566768168089408e+02 -3.359958684022406e+02 -1.272903681003782e+02 -2.818823400219341e+02 + 4 4.085834052199662e+02 8.378661792979844e+01 2.872729401331146e+02 -2.781985973082987e+02 + ME 1.184232976975434e-03 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.089823220133233e+02 -5.197119220861887e+02 4.248734840868308e+02 -2.281183322067745e+02 + 3 5.364076825758044e+02 3.588264146200085e+02 -3.973752875032956e+02 3.270606945152316e+01 + 4 2.546099954108726e+02 1.608855074661802e+02 -2.749819658353517e+01 1.954122627552516e+02 + ME 2.025460632333281e-05 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.835105223217562e+02 -2.128653471696258e+02 1.375287019182911e+02 -4.117725407538515e+02 + 3 7.240136612790379e+02 4.407273454759851e+02 -4.896543389042275e+01 5.723264583716988e+02 + 4 2.924758163992054e+02 -2.278619983063593e+02 -8.856326802786832e+01 -1.605539176178473e+02 + ME 5.493588676089964e-04 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.611118500396009e+02 3.502021063704276e+02 -2.011693879247277e+02 -5.234102027267808e+02 + 3 3.072944371702249e+02 -6.894916504330921e+01 -1.599953986835476e+02 2.531350551695447e+02 + 4 5.315937127901743e+02 
-2.812529413271185e+02 3.611647866082753e+02 2.702751475572363e+02 + ME 6.939798437832852e-05 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.498478362545704e+02 6.780504955298834e+02 -3.199144947524264e+02 -1.319162971889923e+01 + 3 3.253008430749361e+02 -2.985087551774363e+02 1.291384938207140e+02 6.034152914782609e+00 + 4 4.248513206704935e+02 -3.795417403524470e+02 1.907760009317124e+02 7.157476804116659e+00 + ME 7.764812313983123e-05 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.938867893347995e+02 3.689671478502748e+02 -1.218724623869293e+02 3.048516153777389e+02 + 3 5.264063001598521e+02 6.631942569346465e+01 1.276367949726207e+02 -5.063735530147588e+02 + 4 4.797069105053494e+02 -4.352865735437401e+02 -5.764332585691415e+00 2.015219376370201e+02 + ME 4.187923244808115e-05 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.681793141805986e+02 -3.225132888415707e+02 1.579589482507472e+02 -8.117977937027922e+01 + 3 5.431126642386394e+02 4.058413736814013e+01 9.147123993851423e+01 5.338139246166098e+02 + 4 5.887080215807622e+02 2.819291514734306e+02 -2.494301881892614e+02 -4.526341452463305e+02 + ME 4.873506189038712e-03 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.054165399887860e+02 1.497087111729465e+02 8.905021611535379e+01 5.798159601983524e+02 + 3 2.106656439489221e+02 1.451894976721945e+02 
-1.487249448604451e+02 3.436443048222167e+01 + 4 6.839178160622916e+02 -2.948982088451411e+02 5.967472874509132e+01 -6.141803906805740e+02 + ME 4.322642129226900e-02 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.753169163933055e+02 -1.695475157411122e+02 -2.139406274107579e+02 3.581134319495643e+01 + 3 5.760219428901974e+02 -3.264616044953138e+02 1.527507522369444e+02 -4.493231656306969e+02 + 4 6.486611407164975e+02 4.960091202364259e+02 6.118987517381341e+01 4.135118224357404e+02 + ME 1.518972796370837e-04 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.445934948105151e+02 -2.970257025567896e+02 -8.183019525038446e+01 1.543509890854414e+02 + 3 7.485441862377920e+02 6.623797851941251e+02 1.083400559332055e+02 -3.314119056355291e+02 + 4 4.068623189516927e+02 -3.653540826373358e+02 -2.650986068282092e+01 1.770609165500877e+02 + ME 3.814122353255388e-05 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.012122274303650e+02 -5.190018365965079e+01 1.322177369426911e+02 -1.425173724194239e+02 + 3 7.122630330184552e+02 -3.054768058087830e+02 -2.528097616133815e+02 5.916838461125125e+02 + 4 5.865247395511842e+02 3.573769894684374e+02 1.205920246706905e+02 -4.491664736930889e+02 + ME 3.076151768306831e-03 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.490485793345990e+02 3.485190427929747e+02 -2.661098616642628e+01 
-2.819059396826193e+02 + 3 5.531554978829223e+02 -3.330165694254378e+02 4.416170126965179e+02 7.442003978758297e+00 + 4 4.977959227824787e+02 -1.550247336753695e+01 -4.150060265300916e+02 2.744639357038610e+02 + ME 4.598644016648316e-05 + +Event 197 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.951249254444254e+02 -2.278358800090240e+02 3.101157211704545e+02 -8.968142489336995e+01 + 3 3.607080640108545e+02 -2.889948719219028e+02 2.155030307719242e+02 -1.227661082778766e+01 + 4 7.441670105447205e+02 5.168307519309260e+02 -5.256187519423793e+02 1.019580357211576e+02 + ME 3.392572035091148e-02 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.750236904637999e+02 1.183014344420310e+02 -1.005952209347265e+02 -3.413621838211424e+02 + 3 4.381296266085965e+02 -2.726825461625328e+02 1.003845461170282e+02 -3.279096546785175e+02 + 4 6.868466829276034e+02 1.543811117205018e+02 2.106748176981258e-01 6.692718384996599e+02 + ME 9.860557000175199e-04 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.454478562244572e+02 -2.058455361543722e+02 -1.131056012155069e+02 -7.126982772660263e+01 + 3 5.321797086694488e+02 -9.806778012582419e+01 -4.820333037417012e+02 -2.030808875905193e+02 + 4 7.223724351060941e+02 3.039133162801963e+02 5.951389049572082e+02 2.743507153171220e+02 + ME 1.587721829582838e-03 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
3.952431318363245e+02 3.031309873729304e+02 9.337877017948551e+01 2.358159092128122e+02 + 3 6.094031244332665e+02 -7.796753338981907e+01 -5.315426896439308e+02 -2.876727322709445e+02 + 4 4.953537437304095e+02 -2.251634539831113e+02 4.381639194644453e+02 5.185682305813225e+01 + ME 6.908008481734390e-05 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.497938633639727e+02 3.771120671245743e+02 3.553445817627055e+02 -3.921081252746437e+02 + 3 3.369790646193911e+02 -2.140351778515324e+02 1.061239955238162e+02 2.376584318047305e+02 + 4 5.132270720166355e+02 -1.630768892730419e+02 -4.614685772865218e+02 1.544496934699135e+02 + ME 6.041436922818793e-05 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.267802742470180e+02 6.523432021666289e+02 -1.481957728499301e+02 2.840702844913056e+02 + 3 3.546086620137576e+02 -3.102429173963679e+02 -5.939291787501398e+01 -1.611493614224695e+02 + 4 4.186110637392243e+02 -3.421002847702609e+02 2.075886907249440e+02 -1.229209230688361e+02 + ME 1.798527608658209e-04 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.830190702985664e+02 2.789429895135887e+02 -3.943102945050297e+02 -4.197918611657844e+00 + 3 5.247163710833167e+02 -4.266462829986154e+02 3.263988520595893e+01 3.037019215942699e+02 + 4 4.922645586181172e+02 1.477032934850268e+02 3.616704092990706e+02 -2.995040029826120e+02 + ME 5.844688813816974e-04 + +Event 204 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.952375769935189e+02 3.823764713153297e+01 6.531840992713524e+02 -2.350397908115461e+02 + 3 6.250862947179036e+02 1.031861473443960e+02 -5.506835576815645e+02 2.771878679515999e+02 + 4 1.796761282885782e+02 -1.414237944759291e+02 -1.025005415897879e+02 -4.214807714005372e+01 + ME 1.751151525398342e-04 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.625197268936786e+02 2.955060596751036e+02 4.395356105446072e+02 -1.895074112086702e+02 + 3 3.144813194259644e+02 -1.941101430078122e+02 -7.073026664887026e+00 -2.473251401357733e+02 + 4 6.229989536803573e+02 -1.013959166672914e+02 -4.324625838797200e+02 4.368325513444434e+02 + ME 1.089572476080242e-04 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.487698581700872e+02 -4.771827558939671e+02 -2.639484985605369e+02 6.145050708573942e+01 + 3 4.357856725513921e+02 1.877155863290790e+02 1.701172104948723e+02 3.545872893148350e+02 + 4 5.154444692785203e+02 2.894671695648880e+02 9.383128806566407e+01 -4.160377964005747e+02 + ME 4.137956585121730e-03 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.289473514933904e+02 -3.230637718239222e+02 -3.258094337294262e+02 2.631792409740627e+02 + 3 3.730441408755687e+02 -1.145152671243400e+02 -7.298530142052728e+01 -3.474497523579300e+02 + 4 5.980085076310412e+02 4.375790389482623e+02 3.987947351499535e+02 8.427051138386733e+01 + ME 1.082416094240340e-04 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.144460531270951e+02 3.105028133645123e+02 -3.495125011961061e+01 3.525242310830971e+01 + 3 7.230517599976930e+02 -6.554206809343710e+02 2.220922910679197e+02 2.095294558946057e+02 + 4 4.625021868752115e+02 3.449178675698587e+02 -1.871410409483092e+02 -2.447818790029154e+02 + ME 5.036266077357514e-04 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.827014058170526e+02 -6.682954863774691e+01 -1.958656753088386e+02 -1.925890275057887e+02 + 3 5.969812148172334e+02 5.625717004655274e+02 1.060136244597390e+02 -1.692949027847389e+02 + 4 6.203173793657135e+02 -4.957421518277806e+02 8.985205084909933e+01 3.618839302905276e+02 + ME 1.059254286945194e-04 + +Event 210 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.369223392964550e+02 -2.366581006943837e+02 8.850719545688517e+01 -2.228813191927022e+02 + 3 6.926279093100446e+02 9.835546321295953e+01 -1.581805884470998e+02 6.671120783270954e+02 + 4 4.704497513935005e+02 1.383026374814241e+02 6.967339299021459e+01 -4.442307591343932e+02 + ME 5.975508326496533e-02 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.754314663824422e+02 -1.965408456680789e+02 -5.399725108422632e+02 3.037689947684008e+01 + 3 6.656941886103589e+02 4.112771407945243e+02 5.114655840792436e+02 1.113679599883347e+02 + 4 2.588743450071987e+02 -2.147362951264454e+02 2.850692676301958e+01 -1.417448594651748e+02 + ME 4.429279610405934e-04 + +Event 212 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.922157374848575e+02 8.073316194509509e+00 4.947261155542873e+02 -3.254233732830556e+02 + 3 3.635572903001510e+02 8.951663862813328e+01 4.011175755255380e+01 3.500738802669425e+02 + 4 5.442269722149915e+02 -9.758995482264277e+01 -5.348378731068407e+02 -2.465050698388703e+01 + ME 2.978539039959498e-04 + +Event 213 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.434820262506831e+02 2.991548764052632e+02 2.111623598614187e+02 -6.470566753063677e+02 + 3 5.607612173038239e+02 -2.664197873565703e+02 -1.905271140771769e+02 4.551626726109782e+02 + 4 1.957567564454930e+02 -3.273508904869271e+01 -2.063524578424195e+01 1.918940026953895e+02 + ME 1.176354046325226e-04 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.400874280734794e+02 3.457358963402696e+02 2.445843697627679e+02 -3.351710101016578e+02 + 3 3.400793067879316e+02 1.482066942304564e+02 1.256466447865830e+02 2.791086371729012e+02 + 4 6.198332651385894e+02 -4.939425905707262e+02 -3.702310145493509e+02 5.606237292875652e+01 + ME 1.428030491487673e-04 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.916345321859862e+02 3.271767110560380e+02 -1.945589530122144e+02 9.208594000107220e+01 + 3 6.136750729169615e+02 -1.269585669220027e+02 2.644680756040780e+02 -5.390132228350478e+02 + 4 4.946903948970534e+02 -2.002181441340350e+02 -6.990912259186327e+01 
4.469272828339764e+02 + ME 5.783250224164743e-05 + +Event 216 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.767411090262153e+02 1.602503356822860e+02 2.758455349572532e+02 -2.004069210086422e+02 + 3 4.061922956351254e+02 3.340053729931860e+02 2.237650079776778e+02 5.798114391563541e+01 + 4 7.170665953386591e+02 -4.942557086754720e+02 -4.996105429349309e+02 1.424257770930067e+02 + ME 1.246677660757186e-03 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.474118977458854e+02 -5.378641111590875e+02 -3.279650037002521e+02 1.492759847325320e+02 + 3 5.088298200539714e+02 3.261878344469131e+02 1.555821256186315e+02 -3.581947579501666e+02 + 4 3.437582822001434e+02 2.116762767121744e+02 1.723828780816206e+02 2.089187732176346e+02 + ME 2.808592701363957e-05 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.658501161076261e+02 -6.577627036244853e+02 -3.020200479570948e+01 9.895676706252428e+01 + 3 2.516345839620714e+02 1.565221509782131e+02 -1.156477271957936e+02 1.595192254662914e+02 + 4 5.825152999303024e+02 5.012405526462722e+02 1.458497319915031e+02 -2.584759925288157e+02 + ME 5.710861431711459e-04 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.328556070633436e+02 6.122246558068493e+01 -1.687441385117925e+02 3.938796795879555e+02 + 3 6.500677455605623e+02 -3.703058656885360e+02 4.356876543064814e+02 -3.092537914719427e+02 + 4 
4.170766473760947e+02 3.090834001078510e+02 -2.669435157946889e+02 -8.462588811601289e+01 + ME 2.966557992508409e-04 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.686297280598668e+02 -3.497113779929074e+02 -8.765282776369959e+01 7.685577594963361e+01 + 3 4.155522773953193e+02 -1.777404948015451e+02 -1.525848366500188e+02 3.432344379292751e+02 + 4 7.158179945448152e+02 5.274518727944525e+02 2.402376644137180e+02 -4.200902138789081e+02 + ME 3.552942080819825e-03 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.295220830718469e+02 3.654688468413811e+01 4.204675060608333e+02 3.197890523886257e+02 + 3 7.127556392876786e+02 -1.727486268095863e+02 -4.342549693537606e+02 -5.381460163035254e+02 + 4 2.577222776404743e+02 1.362017421254481e+02 1.378746329292729e+01 2.183569639148998e+02 + ME 3.152357661913820e-05 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.464305981122427e+02 -2.054199106396077e+02 6.127423271580307e+01 1.215572638876956e+02 + 3 6.926647117218595e+02 4.702892479611936e+02 3.872350261814336e+02 -3.296383785530530e+02 + 4 5.609046901658980e+02 -2.648693373215859e+02 -4.485092588972366e+02 2.080811146653574e+02 + ME 7.038451789550760e-05 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.463384302181124e+02 -1.209251938955737e+02 -2.140981972257043e+02 -1.488897673935926e+01 + 3 6.819620845265061e+02 
-2.400891875757810e+02 5.819023806457058e+02 2.623339210620683e+02 + 4 5.716994852553809e+02 3.610143814713547e+02 -3.678041834200016e+02 -2.474449443227090e+02 + ME 4.113238561351750e-04 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.236851263016065e+02 -8.671871524968941e+01 1.717231909970331e+02 1.141317038679677e+02 + 3 5.308972974363860e+02 -3.715833295101987e+01 4.680039348616381e+02 2.478780257941054e+02 + 4 7.454175762620065e+02 1.238770482007101e+02 -6.397271258586711e+02 -3.620097296620725e+02 + ME 8.897679002096311e-02 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.094176014319267e+02 1.569347096242780e+02 -1.561291130928883e+00 -4.846394040251012e+02 + 3 7.252311334449814e+02 -3.845161955462209e+02 -4.374219820797173e+01 6.133466494377277e+02 + 4 2.653512651230915e+02 2.275814859219426e+02 4.530348933890066e+01 -1.287072454126262e+02 + ME 4.208384847365386e-04 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.863217264048350e+02 -2.391756120967483e+02 -6.171186323675804e+02 1.816511279850092e+02 + 3 5.332348374442744e+02 1.096335504493486e+02 4.112484130583279e+02 -3.212391931833644e+02 + 4 2.804434361508906e+02 1.295420616473995e+02 2.058702193092524e+02 1.395880651983551e+02 + ME 3.176013098642929e-05 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.243206345463227e+02 -5.280189925476209e+02 
-1.406011303275692e+02 4.754657162080069e+02 + 3 5.487499634657127e+02 3.840442912861270e+02 -1.353123555187441e+01 -3.917312987222201e+02 + 4 2.269294019879643e+02 1.439747012614939e+02 1.541323658794436e+02 -8.373441748578678e+01 + ME 2.785367546706673e-04 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.119578664379945e+02 1.625437651479949e+01 -1.806612394559917e+02 1.096514885776142e+02 + 3 6.254097456672617e+02 -3.200704000326812e+01 3.158243706171928e+02 5.388579277416935e+02 + 4 6.626323878947439e+02 1.575266348846865e+01 -1.351631311612011e+02 -6.485094163193077e+02 + ME 9.019763103929869e-01 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.921227120343663e+02 -3.877491982207574e+02 4.449193714386763e+02 -4.802726626309341e+01 + 3 4.688278331283220e+02 3.470549659129083e+02 -1.517581364471262e+02 -2.762641051115459e+02 + 4 4.390494548373112e+02 4.069423230784908e+01 -2.931612349915501e+02 3.242913713746393e+02 + ME 3.165107183512214e-05 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.261952284727869e+02 2.153699775439378e+02 -1.171086083390751e+02 3.486312082969336e+02 + 3 3.540619701921574e+02 3.070144260847320e+01 1.307424531367546e+02 3.276029778648148e+02 + 4 7.197428013350558e+02 -2.460714201524110e+02 -1.363384479767958e+01 -6.762341861617484e+02 + ME 3.215201095202690e-01 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 4.205236024420394e+02 7.533931576750221e+01 -3.260217181731272e+02 -2.547036061581323e+02 + 3 5.397543491930861e+02 8.423195081267899e+01 -1.158376015978276e+02 5.204050211049135e+02 + 4 5.397220483648742e+02 -1.595712665801811e+02 4.418593197709548e+02 -2.657014149467810e+02 + ME 5.675089147239935e-04 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.295782852421119e+02 3.239064445356881e+02 9.240815775655157e-01 2.821724019337123e+02 + 3 7.183371274312137e+02 -6.155391061575078e+02 -1.955291718271078e+02 -3.144649112405859e+02 + 4 3.520845873266733e+02 2.916326616218200e+02 1.946050902495421e+02 3.229250930687325e+01 + ME 7.565421699429522e-05 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.640046126075326e+02 -2.220120664068516e+02 -1.165482463207536e+02 2.638683509799470e+02 + 3 4.682121509308885e+02 -1.009786196736113e+02 3.762431872847592e+02 2.597441061312977e+02 + 4 6.677832364615792e+02 3.229906860804629e+02 -2.596949409640056e+02 -5.236124571112447e+02 + ME 5.466722055378081e-03 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 8.690043548936441e+01 -2.607433849884744e+01 -7.258333015587985e+01 4.004341073848801e+01 + 3 6.785651905172676e+02 -3.574930335951373e+02 -4.725723606052792e+01 5.748184081539155e+02 + 4 7.345343739933678e+02 3.835673720939847e+02 1.198405662164077e+02 -6.148618188924036e+02 + ME 2.018776041498044e-01 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.000566282865330e+02 1.219146462304111e+01 -2.126850238006026e+02 2.113064812540423e+02 + 3 7.160981218147419e+02 2.575873756248089e+02 2.779062108697768e+02 -6.076293293985469e+02 + 4 4.838452498987245e+02 -2.697788402478499e+02 -6.522118706917435e+01 3.963228481445046e+02 + ME 4.426868843941219e-05 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.510518772182422e+02 -9.497518588910036e+01 1.467158067736534e+01 1.165380984781943e+02 + 3 6.955499852411461e+02 5.933480346078575e+02 3.495450158124773e+02 9.770452249822526e+01 + 4 6.533981375406115e+02 -4.983728487187571e+02 -3.642165964898427e+02 -2.142426209764196e+02 + ME 1.204369674381906e-03 + +Event 237 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.173874152942701e+02 2.069918593916189e+02 -3.850229167793931e+01 -5.412237993169356e+01 + 3 7.305677895866183e+02 -6.701932224704495e+02 -2.421540700080861e+02 1.610333695687662e+02 + 4 5.520447951191119e+02 4.632013630788306e+02 2.806563616860255e+02 -1.069109896370727e+02 + ME 1.968332171347030e-04 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.349573912113934e+02 -3.336495545457479e+02 -4.785400196851593e+02 2.506956580500141e+02 + 3 5.768887318987100e+02 4.812119270965609e+02 2.334547330568690e+02 -2.161818165921042e+02 + 4 2.881538768898969e+02 -1.475623725508128e+02 2.450852866282899e+02 -3.451384145790984e+01 + ME 9.649484883467785e-05 + +Event 239 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.349076725903782e+02 -5.331874414268931e+02 1.887721601290928e+01 -3.848403846142781e+01 + 3 3.658437465440001e+02 8.335465236419728e+01 1.670818061666300e+01 -3.558292926602242e+02 + 4 5.992485808656212e+02 4.498327890626960e+02 -3.558539662957237e+01 3.943133311216517e+02 + ME 8.663618478590563e-05 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.870582387324442e+02 1.830793600232297e+02 -1.562409872742485e+02 1.564389154054251e+02 + 3 6.007192677438852e+02 3.433229388031108e+02 4.688113613010561e+02 -1.523446941819631e+02 + 4 6.122224935236704e+02 -5.264022988263405e+02 -3.125703740268074e+02 -4.094221223462029e+00 + ME 1.482124911655756e-04 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.424696267657398e+02 4.823783107714220e+02 2.498315161211406e+02 5.061190823507635e+02 + 3 2.455726236162736e+02 -1.827879695947951e+02 -1.199757723946156e+02 -1.118046764652876e+02 + 4 5.119577496179859e+02 -2.995903411766270e+02 -1.298557437265251e+02 -3.943144058854758e+02 + ME 2.671069096983680e-03 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.249130370348900e+02 1.676828147928013e+02 6.059046362201675e+02 -3.609168279440811e+02 + 3 6.240672718074164e+02 -4.529413961306756e+01 -5.490982345027016e+02 2.930862151720546e+02 + 4 1.510196911576932e+02 -1.223886751797336e+02 -5.680640171746590e+01 6.783061277202636e+01 + ME 
3.947039622395617e-05 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.655090712555230e+02 2.096323612054770e+02 2.113490506800235e+02 3.578890153850057e+02 + 3 5.764797256412519e+02 6.697224883641853e+01 -5.382210340689440e+02 -1.953502251008744e+02 + 4 4.580112031032260e+02 -2.766046100418948e+02 3.268719833889207e+02 -1.625387902841315e+02 + ME 2.373038569035547e-04 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.237109195354749e+02 1.305098338947756e+02 -4.868141165486322e+02 -1.423106687020528e+02 + 3 5.804450110242352e+02 -4.045654344879671e+02 2.643676733537771e+02 3.214855413949400e+02 + 4 3.958440694402901e+02 2.740556005931916e+02 2.224464431948551e+02 -1.791748726928872e+02 + ME 2.656415514849882e-04 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.629169357520612e+02 2.457511487795889e+02 -4.402365929491729e+01 -8.242333044139184e+01 + 3 6.931386101565748e+02 -5.195573187661655e+02 4.004017488088275e+02 -2.240084037645317e+02 + 4 5.439444540913644e+02 2.738061699865766e+02 -3.563780895139104e+02 3.064317342059234e+02 + ME 4.933262947502149e-05 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.300937687157436e+02 -5.459948028041551e+02 3.085954426748103e+02 6.063567799240774e+01 + 3 1.673910408536142e+02 -3.546130270298914e+01 7.662824936562286e+01 -1.445350060290698e+02 + 4 7.025151904306418e+02 
5.814561055071443e+02 -3.852236920404333e+02 8.389932803666211e+01 + ME 6.096838345621226e-04 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.577847506495702e+02 2.418237207037818e+02 -8.449121421856779e+01 2.890502538162604e+01 + 3 5.130193185035739e+02 4.381905811488919e+02 1.366496386102691e+02 2.291390669832419e+02 + 4 7.291959308468561e+02 -6.800143018526737e+02 -5.215842439170128e+01 -2.580440923648678e+02 + ME 4.095241235940727e-03 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.033207479153641e+02 -5.040306065309411e+02 -2.020637997366072e+02 4.469714117975367e+02 + 3 1.758360012551319e+02 -1.471306652922548e+01 -4.035460943683618e+00 -1.751728862172264e+02 + 4 6.208432508295036e+02 5.187436730601667e+02 2.060992606802908e+02 -2.717985255803104e+02 + ME 5.331973232924498e-04 + +Event 249 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.018816177222694e+02 5.523075638651412e+01 1.752331212074551e+02 2.395316845419020e+02 + 3 6.597415560701298e+02 6.315352823685415e+01 -6.561001191322722e+02 -2.834054254405022e+01 + 4 5.383768262076012e+02 -1.183842846233684e+02 4.808669979248172e+02 -2.111911419978518e+02 + ME 5.143373215718181e-04 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.166381935101299e+02 -1.289072913913530e+02 -1.189615590004072e+02 -1.271344351215278e+02 + 3 6.815426093761063e+02 -2.511966318704652e+02 
5.323234433390908e+02 3.435583388650891e+02 + 4 6.018191971137629e+02 3.801039232618180e+02 -4.133618843386822e+02 -2.164239037435609e+02 + ME 3.663205291365313e-04 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.676961532387151e+02 -3.991265595084289e+01 -4.419965947723095e+02 4.988628500443887e+02 + 3 7.150412702460949e+02 3.921851524844912e+01 5.505653759000155e+02 -4.545587894617490e+02 + 4 1.172625765151895e+02 6.941407023942203e-01 -1.085687811277061e+02 -4.430406058263954e+01 + ME 5.881473572337537e-04 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.112668789066533e+02 -1.147554660376938e+02 3.364589711187054e+01 -1.741632301749357e+02 + 3 7.393007599584276e+02 2.529046383258835e+02 -3.593132473314827e+02 5.945576909606565e+02 + 4 5.494323611349191e+02 -1.381491722881897e+02 3.256673502196121e+02 -4.203944607857206e+02 + ME 2.766175794580763e-03 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.299659304470913e+01 -4.405884533650594e+01 -5.451291667290519e+01 2.038780663930336e+01 + 3 7.253475305576840e+02 3.245698054519170e+02 -1.402290280555607e+02 -6.333397991328418e+02 + 4 7.016558763976062e+02 -2.805109601154107e+02 1.947419447284657e+02 6.129519924935382e+02 + ME 6.932122881592155e-04 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.982520535096858e+02 -6.164633378269740e+01 1.773450413210087e+02 
-6.365801262063786e+01 + 3 7.183815394471146e+02 -1.984891252513598e+02 -6.893152145826988e+02 -3.896971029099804e+01 + 4 5.833664070431996e+02 2.601354590340572e+02 5.119701732616901e+02 1.026277229116358e+02 + ME 1.028356945275399e-04 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.347080663542582e+02 -5.063606624096444e+02 1.592577719822621e+02 6.440929941880935e+01 + 3 2.475406015289463e+02 -1.856063881081878e+02 3.468010668896055e+00 -1.637516137347836e+02 + 4 7.177513321167951e+02 6.919670505178326e+02 -1.627257826511581e+02 9.934231431597432e+01 + ME 1.290934313414619e-03 + +Event 0 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.775677821222390e+02 4.314431287975208e+02 -2.652567205762378e+02 -2.776332864556196e+02 + 3 6.023469575940328e+02 -3.228069847179709e+02 5.005558924007595e+02 8.978477890465912e+01 + 4 3.200852602837276e+02 -1.086361440795499e+02 -2.352991718245217e+02 1.878485075509604e+02 + ME 3.040167867259869e-05 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.241206267812559e+02 3.541578305635416e+02 -4.894807402105654e+02 3.991635230623179e+02 + 3 7.375567605136828e+02 -3.903081173548693e+02 4.920451519627784e+02 -3.867054653560790e+02 + 4 3.832261270506110e+01 3.615028679132772e+01 -2.564411752212871e+00 -1.245805770623896e+01 + ME 1.038930478375729e-03 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
4.849204091734790e+02 2.108660079931152e+02 4.054727376659824e+02 1.620962335024329e+02 + 3 2.728468517759738e+02 4.961449545460115e+01 2.005017763154939e+02 1.782774356422519e+02 + 4 7.422327390505470e+02 -2.604805034477164e+02 -6.059745139814763e+02 -3.403736691446848e+02 + ME 2.765806449032608e-02 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.264155576764489e+02 -4.170952165204416e+02 -7.054834331799707e+01 5.370977042744418e+01 + 3 7.108631972082329e+02 6.832597695609467e+02 -1.727180704166534e+02 -9.301097030017993e+01 + 4 3.627212451153183e+02 -2.661645530405051e+02 2.432664137346504e+02 3.930119987273574e+01 + ME 5.953735604318600e-05 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.183269968238451e+02 -3.584978055671313e+02 -5.048824553914337e+02 -3.640971079361011e+02 + 3 7.387431276480258e+02 4.013538934928405e+02 5.036810263913360e+02 3.618865629982621e+02 + 4 4.292987552812848e+01 -4.285608792570925e+01 1.201429000097645e+00 2.210544937839321e+00 + ME 3.389146146191508e-04 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.529780005473894e+02 -8.443182436392422e+01 4.445408460134586e+02 -2.106590230986441e+01 + 3 4.683757780543922e+02 -6.076819021151036e+01 -1.335482427838442e+02 -4.448010379662152e+02 + 4 5.786462213982178e+02 1.452000145754346e+02 -3.109926032296145e+02 4.658669402760799e+02 + ME 7.961184260108527e-05 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 6.238848262005391e+02 -1.065131260140052e+02 -4.741487807795933e+02 -3.912418229627632e+02 + 3 1.729069432107233e+02 -1.460869767542721e+02 -8.199113358821990e+01 4.281191710484079e+01 + 4 7.032082305887382e+02 2.526001027682773e+02 5.561399143678132e+02 3.484299058579224e+02 + ME 4.860310346056595e-04 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.977203086376782e+02 -6.126072843634399e+02 -1.744636661244187e+02 2.847602033865263e+02 + 3 1.614193396272252e+02 -4.571584237043671e+00 8.497734613495713e+01 -1.371646983269120e+02 + 4 6.408603517350969e+02 6.171788686004837e+02 8.948631998946141e+01 -1.475955050596143e+02 + ME 3.340281718714093e-04 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.871091945484288e+02 4.059708628308462e+02 2.886614153103367e+02 4.732666173272760e+02 + 3 5.653302025665632e+02 -2.838835484844413e+02 -7.353399035097290e+01 -4.833229987253827e+02 + 4 2.475606028850082e+02 -1.220873143464048e+02 -2.151274249593636e+02 1.005638139810630e+01 + ME 7.748258592751941e-05 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.618579955503452e+02 1.385215220188489e+01 1.601201234527701e+02 -1.917484467788566e+01 + 3 7.196660585644589e+02 -4.527189715496824e+02 -4.214090439733052e+02 3.679391067910630e+02 + 4 6.184759458851961e+02 4.388668193477976e+02 2.612889205205350e+02 -3.487642621131773e+02 + ME 1.101359209533348e-03 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 
+ 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.832785200561162e+01 1.027681340851886e+01 -7.242726264265977e+01 -2.799877018853974e+01 + 3 7.448007230566494e+02 2.520540107528716e+02 6.813719334665398e+02 1.641011304445167e+02 + 4 6.768714249377393e+02 -2.623308241613905e+02 -6.089446708238800e+02 -1.361023602559769e+02 + ME 6.352982637705025e-04 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.478627446486677e+02 2.070882322301630e+02 -4.708081692757452e+02 1.887000762823861e+02 + 3 6.997827604382593e+02 -4.209013422316021e+02 4.569873120768409e+02 -3.220257264800591e+02 + 4 2.523544949130733e+02 2.138131100014392e+02 1.382085719890431e+01 1.333256501976729e+02 + ME 3.013443607901958e-05 + +Event 12 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.802868936311938e+02 -4.467002255894126e+01 5.211262762381961e+02 -2.513262266832405e+02 + 3 5.208038834706859e+02 2.151797013176277e+01 -4.993650129388666e+02 -1.463155694111945e+02 + 4 3.989092228981198e+02 2.315205242717859e+01 -2.176126329932955e+01 3.976417960944350e+02 + ME 4.859982681736982e-04 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.774880087360025e+02 1.576445054854711e+02 5.481077151088401e+02 -9.065617884226722e+01 + 3 5.915098138161557e+02 -3.018001633277128e+02 -3.808656371901899e+02 3.372564123391870e+02 + 4 3.310021774478421e+02 1.441556578422419e+02 -1.672420779186502e+02 -2.466002334969198e+02 + ME 1.502790901734900e-03 + +Event 14 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.531797527967491e+02 -8.400833666640551e+01 -2.384535242035554e+02 -1.350938161690894e+01 + 3 5.261064571264828e+02 -1.751971590790252e+02 -3.334570051994592e+02 3.672878780523887e+02 + 4 7.207137900767681e+02 2.592054957454308e+02 5.719105294030146e+02 -3.537784964354798e+02 + ME 3.474016539299094e-03 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.605848765362424e+02 3.563504404614685e+02 1.735853700506503e+02 2.345653669687875e+02 + 3 4.216445088607454e+02 1.370719005416187e+02 -3.933730877164850e+02 6.521502736890034e+01 + 4 6.177706146030118e+02 -4.934223410030871e+02 2.197877176658347e+02 -2.997803943376878e+02 + ME 4.625741948268507e-04 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.972484926572780e+02 -1.474122335888776e+02 -4.748950276275916e+02 -6.399787981958489e-01 + 3 5.072511849723049e+02 4.846784046822066e+02 1.224000792205880e+02 -8.607455661990269e+01 + 4 4.955003223704172e+02 -3.372661710933287e+02 3.524949484070037e+02 8.671453541809866e+01 + ME 5.855276662051888e-05 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.182636773520260e+02 -9.176062613973060e+01 -1.890905041641619e+02 2.389906630959087e+02 + 3 6.376303990615819e+02 -4.240378519397394e+02 2.706855745366566e+02 -3.917827786765570e+02 + 4 5.441059235863918e+02 5.157984780794702e+02 -8.159507037249485e+01 1.527921155806483e+02 + ME 
7.693381641318889e-05 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.532560008158404e+02 -4.148613005881325e+02 1.689647846464810e+02 -3.247047971041213e+02 + 3 3.650144721835348e+02 -1.597348634907620e+02 -2.160675866909895e+02 2.470529017650751e+02 + 4 5.817295270006247e+02 5.745961640788945e+02 4.710280204450830e+01 7.765189533904639e+01 + ME 9.334842438108827e-05 + +Event 19 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.263687475619530e+02 -1.904667433734991e+02 2.390747946355329e+02 -1.143775398573919e+02 + 3 7.331345945903580e+02 2.597391859223820e+02 -6.739404183465076e+02 1.258022320965774e+02 + 4 4.404966578476883e+02 -6.927244254888296e+01 4.348656237109746e+02 -1.142469223918529e+01 + ME 9.609905451536893e-05 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 9.588718605412235e+01 4.259536217794532e+01 8.056474827260675e+01 -2.982128277051556e+01 + 3 7.250265356668370e+02 3.120913743414048e+02 -4.446787057645158e+02 4.801284204484704e+02 + 4 6.790862782790413e+02 -3.546867365193502e+02 3.641139574919092e+02 -4.503071376779549e+02 + ME 3.832288824153771e-03 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.825278201605080e+02 -1.533737674675501e+02 8.574830442242747e+01 4.939757963742075e+01 + 3 7.183016103669911e+02 1.713205736990390e+02 -6.275703015775030e+02 -3.045685162014730e+02 + 4 5.991705694725006e+02 -1.794680623148891e+01 
5.418219971550753e+02 2.551709365640523e+02 + ME 8.607003146738376e-05 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.349542451120770e+02 9.235159917618290e+01 -2.156570331301489e+02 -1.291214495308476e+01 + 3 7.360601907662837e+02 -2.182033070539752e+02 6.568866822530020e+02 -2.503433799808774e+02 + 4 5.289855641216395e+02 1.258517078777923e+02 -4.412296491228531e+02 2.632555249339621e+02 + ME 4.760022163892534e-05 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.350908908124364e+02 -7.377772511691033e+00 -2.298431804723787e+02 -4.884063683135331e+01 + 3 6.797114625392685e+02 -5.485955088721075e+02 3.603976926464840e+02 1.765336882516069e+02 + 4 5.851976466482949e+02 5.559732813837986e+02 -1.305545121741054e+02 -1.276930514202538e+02 + ME 2.188313905173109e-04 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.355364173804401e+02 2.538053291625626e+02 -2.665393838801487e+02 -2.328767540869265e+02 + 3 4.093863144993796e+02 -1.953012891316529e+02 -3.573484670764558e+02 4.191221827828568e+01 + 4 6.550772681201798e+02 -5.850404003090968e+01 6.238878509566048e+02 1.909645358086408e+02 + ME 1.927016128219592e-04 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.365386968907906e+02 3.875876454009266e+02 3.151568854896984e+02 5.412404333367774e+02 + 3 5.208510884285564e+02 -2.430585576296288e+02 -1.518636440371933e+02 
-4.349089876054082e+02 + 4 2.426102146806531e+02 -1.445290877712977e+02 -1.632932414525050e+02 -1.063314457313692e+02 + ME 3.545098492818308e-04 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.198867014174700e+02 5.189601929589824e+02 4.797253921416957e+02 -1.370428003807496e+02 + 3 3.889101953712927e+02 -1.847394503243419e+02 -2.837815501141774e+02 1.912864537085460e+02 + 4 3.912031032112369e+02 -3.342207426346404e+02 -1.959438420275182e+02 -5.424365332779645e+01 + ME 1.136605271245841e-04 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.732032222628646e+02 5.870808395006010e+02 -9.126179303429218e+01 3.165595544104447e+02 + 3 1.177373967283342e+02 7.847176641415683e+01 5.304379211899001e+00 -8.761358356661104e+01 + 4 7.090593810088013e+02 -6.655526059147578e+02 8.595741382239318e+01 -2.289459708438336e+02 + ME 1.559692831610762e-03 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.475300414228806e+02 3.136396845517189e+02 3.816259196370642e+02 -4.186728559156668e+02 + 3 7.290923529036073e+02 -2.791764769994178e+02 -4.112865540505715e+02 5.333662195995522e+02 + 4 1.233776056735125e+02 -3.446320755230100e+01 2.966063441350738e+01 -1.146933636838856e+02 + ME 5.042181319532498e-02 + +Event 29 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.156754590345619e+02 -2.870540678871016e+02 4.159516713841875e+01 -1.245825012466667e+02 + 3 
4.770060274033895e+02 -2.355061130652809e+02 -3.231858413754910e+02 -2.600433287405434e+02 + 4 7.073185135620483e+02 5.225601809523826e+02 2.815906742370723e+02 3.846258299872100e+02 + ME 7.977413909955766e-04 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.091290614221006e+02 1.543004089904795e+02 4.216196287493841e+00 -5.892468251447810e+02 + 3 2.079357839022732e+02 2.034647466922836e+02 4.185675980476621e+01 9.348729279626955e+00 + 4 6.829351546756271e+02 -3.577651556827630e+02 -4.607295609226001e+01 5.798980958651542e+02 + ME 4.025729930611754e-04 + +Event 31 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.901710072855794e+02 1.433309098684658e+01 6.447948515477651e+02 -2.457034416076623e+02 + 3 5.898919363861645e+02 1.120085307876391e+02 -4.815950471622465e+02 3.217029626736536e+02 + 4 2.199370563282564e+02 -1.263416217744857e+02 -1.631998043855182e+02 -7.599952106599136e+01 + ME 2.341638051955361e-04 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.144498311923273e+02 5.832947925341469e+02 -1.925283703230111e+02 1.576726595169116e+01 + 3 2.478450424037005e+02 5.004284035329789e+01 2.389954177960992e+02 4.247433867565734e+01 + 4 6.377051264039726e+02 -6.333376328874450e+02 -4.646704747308824e+01 -5.824160462734866e+01 + ME 2.082658212707269e-04 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.134536717469738e+02 -1.625429495269566e+02 
-1.853973484494194e+02 5.617232593785357e+02 + 3 5.361644687950270e+02 -3.755831293394987e+01 -9.992652347025610e+01 -5.254297294928765e+02 + 4 3.503818594579993e+02 2.001012624609065e+02 2.853238719196755e+02 -3.629352988565912e+01 + ME 1.082147339245727e-04 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.840838099420727e+02 -2.442269925519277e+02 -3.827314394217586e+01 -2.939535943332559e+02 + 3 6.022630974514658e+02 3.956891925431131e+01 5.086724982658300e+02 3.200116071158651e+02 + 4 5.136530926064611e+02 2.046580732976164e+02 -4.703993543236541e+02 -2.605801278260915e+01 + ME 1.030778739400437e-04 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.454350783663418e+02 -3.439607925797615e+02 2.363778141880091e+01 -2.139209721976717e+01 + 3 6.705698302143293e+02 5.215327591153250e+02 4.060443141865526e+02 -1.131171661597076e+02 + 4 4.839950914193289e+02 -1.775719665355635e+02 -4.296820956053536e+02 1.345092633794747e+02 + ME 5.272556806902332e-05 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.098652154429358e+02 2.489290984574327e+02 -1.674080692141068e+02 -6.433641786725617e+02 + 3 6.178479130357197e+02 -1.435715807033598e+02 2.588953561477193e+02 5.423065917191846e+02 + 4 1.722868715213448e+02 -1.053575177540730e+02 -9.148728693361247e+01 1.010575869533772e+02 + ME 7.043439767078040e-05 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 6.906872786346029e+02 1.495946561071237e+02 1.712833879510068e+02 6.521750966909805e+02 + 3 3.682276595245591e+02 -1.358558710218083e+02 1.194309698061993e+02 -3.207351477449753e+02 + 4 4.410850618408379e+02 -1.373878508531533e+01 -2.907143577572061e+02 -3.314399489460051e+02 + ME 1.993443501449107e-03 + +Event 38 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.131720166645958e+02 -5.222102655174089e+02 6.340623138461885e+00 3.213038392347352e+02 + 3 4.540063357567761e+02 2.932429176443923e+02 -3.207297067242505e+02 -1.313879727496970e+02 + 4 4.328216475786279e+02 2.289673478730168e+02 3.143890835857887e+02 -1.899158664850381e+02 + ME 2.556723680701482e-04 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.929747896182307e+02 2.510117592312212e+02 -1.378648144805472e+02 6.181113983529410e+01 + 3 6.287164314722788e+02 3.864928360026034e+01 6.254120614625330e+02 5.148142827864524e+01 + 4 5.783087789094906e+02 -2.896610428314809e+02 -4.875472469819858e+02 -1.132925681139391e+02 + ME 1.792417727365923e-04 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.143487538112954e+02 -3.203572478439017e+01 1.022340126870988e+02 3.996944439980560e+01 + 3 7.361483923235807e+02 5.924235295921244e+02 -3.838567751530157e+02 -2.088128187524163e+02 + 4 6.495028538651248e+02 -5.603878048077345e+02 2.816227624659169e+02 1.688433743526105e+02 + ME 2.303227362451932e-04 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.384898508133351e+02 5.540399192408263e+02 -3.014826159773289e+02 -9.908223727147148e+01 + 3 3.510407251698805e+02 -1.719168197014114e+02 2.065966849440144e+02 -2.258140996521069e+02 + 4 5.104694240167846e+02 -3.821230995394149e+02 9.488593103331456e+01 3.248963369235784e+02 + ME 3.970587881169236e-05 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.291654598309212e+02 -1.090829060981258e+02 2.972891943885482e+02 -8.983292515941632e+01 + 3 6.884965239796815e+02 4.933628807557017e+02 -2.919492821202986e+02 3.812953554581829e+02 + 4 4.823380161893969e+02 -3.842799746575758e+02 -5.339912268249619e+00 -2.914624302987665e+02 + ME 6.853707890242213e-04 + +Event 43 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.674173006007982e+02 2.791827424102563e+02 1.079644067383057e+02 2.130637369397045e+02 + 3 7.392205647816576e+02 -6.110484627794917e+02 -4.247874240022369e+01 -4.138385868609020e+02 + 4 3.933621346175443e+02 3.318657203692355e+02 -6.548566433808197e+01 2.007748499211975e+02 + ME 3.494804735294216e-05 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.081359682230012e+02 -1.082501549908087e+02 1.771964605001424e+02 1.427934167997762e+01 + 3 7.449563315308092e+02 5.092828751965591e+02 -5.388739609944279e+02 7.215083562608926e+01 + 4 5.469077002461893e+02 -4.010327202057504e+02 3.616775004942854e+02 -8.643017730606689e+01 + ME 1.926996462037642e-04 + +Event 45 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.180982465404428e+02 4.470261481799606e+02 -3.368837017252438e+01 -2.597277606009550e+02 + 3 3.377595659674064e+02 -7.316527185649471e+01 2.454727770679006e+02 -2.201624016839131e+02 + 4 6.441421874921517e+02 -3.738608763234668e+02 -2.117844068953764e+02 4.798901622848686e+02 + ME 1.616288161456167e-04 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.296560291524888e+02 2.172411497655985e+02 5.821614514430422e+02 -1.017892054705761e+02 + 3 6.224001894826197e+02 1.405102091633609e+01 -6.218608257778047e+02 2.176414579432110e+01 + 4 2.479437813648912e+02 -2.312921706819346e+02 3.969937433476264e+01 8.002505967625511e+01 + ME 4.055379741072412e-05 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.458843469271558e+02 -1.019033861791133e+02 -1.559739004096152e+02 5.131058004898495e+02 + 3 2.573134207008559e+02 6.791700498899543e+01 -2.412204887508016e+02 5.839651284901167e+01 + 4 6.968022323719882e+02 3.398638119011773e+01 3.971943891604168e+02 -5.715023133388612e+02 + ME 1.419034531204010e-02 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.623920218006384e+02 -6.284562032939594e+02 -1.837527125398962e+02 -1.002044496053409e+02 + 3 1.251779629744606e+02 -7.502448682133647e+01 9.550779386908961e+01 3.031682869117444e+01 + 4 7.124300152249010e+02 7.034806901152959e+02 8.824491867080656e+01 6.988762091416655e+01 + ME 
8.358597237275323e-04 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.397494808364366e+02 2.393958238941667e+02 -4.144666783354253e+00 -1.233996761053011e+01 + 3 6.782491241100330e+02 -3.516321535544010e+02 -2.705899831712921e+02 5.129890485673948e+02 + 4 5.820013950535310e+02 1.122363296602345e+02 2.747346499546463e+02 -5.006490809568649e+02 + ME 9.140198109432023e-03 + +Event 50 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.764898792162554e+02 4.667163214316568e+02 5.900817880915086e+01 -7.573978570375913e+01 + 3 5.114228101321805e+02 -2.035689445851523e+02 -4.549677995197112e+02 -1.145306811477843e+02 + 4 5.120873106515638e+02 -2.631473768465044e+02 3.959596207105603e+02 1.902704668515434e+02 + ME 5.185003130076207e-05 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.678795643859633e+02 4.629737719234087e+02 5.365495313512260e+01 4.108186077915564e+01 + 3 6.311645871918953e+02 -4.500610707732840e+02 -4.345770688214701e+02 8.340587481742409e+01 + 4 4.009558484221419e+02 -1.291270115012473e+01 3.809221156863476e+02 -1.244877355965798e+02 + ME 1.555224976502730e-04 + +Event 52 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.696230029266819e+02 2.516704934433110e+02 2.514038675722595e+02 1.003953305301003e+02 + 3 6.696174214325738e+02 -2.754912388418390e+01 -6.493999246431116e+02 -1.609604756850079e+02 + 4 4.607595756407442e+02 
-2.241213695591271e+02 3.979960570708520e+02 6.056514515490755e+01 + ME 6.141593565885964e-05 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.284624742442375e+01 -4.271742504396477e+01 -2.683807109937144e+01 -5.255012179908527e+01 + 3 7.493542950735830e+02 3.356513586119742e+02 2.501807367708783e+02 6.215139772812375e+02 + 4 6.777994575019937e+02 -2.929339335680092e+02 -2.233426656715070e+02 -5.689638554821523e+02 + ME 1.643425481793044e-02 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.460259847230066e+02 2.055186857047574e+01 6.233229443227744e+02 4.093908861479222e+02 + 3 5.756222844616436e+02 2.606063779094543e+01 -4.696411468594732e+02 -3.318117699890848e+02 + 4 1.783517308153498e+02 -4.661250636142105e+01 -1.536817974633011e+02 -7.757911615883737e+01 + ME 4.226998853404132e-04 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.967428482894215e+02 -8.165820254184375e+01 5.098287527914878e+02 -2.991798919868828e+02 + 3 5.942526243827265e+02 5.606061544962814e+01 -2.905196430116550e+02 5.153559216750567e+02 + 4 3.090045273278509e+02 2.559758709221549e+01 -2.193091097798325e+02 -2.161760296881746e+02 + ME 1.754183168220650e-03 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.610874267302015e+02 -4.199055433713192e+02 3.580252469767042e+02 1.015694718309908e+02 + 3 6.303091265298390e+02 2.130872195586830e+02 
-5.453843477211296e+02 -2.333224059286980e+02 + 4 3.086034467399593e+02 2.068183238126362e+02 1.873591007444254e+02 1.317529340977073e+02 + ME 3.297744712996021e-05 + +Event 57 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.552053965855976e+02 4.516249927537604e+02 7.110694105335413e+00 4.746350341729918e+02 + 3 6.035190443408457e+02 -3.717228873476764e+02 2.148772607224587e+02 -4.241286299324849e+02 + 4 2.412755590735561e+02 -7.990210540608395e+01 -2.219879548277939e+02 -5.050640424050682e+01 + ME 1.593460885556305e-04 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.959982971085279e+02 1.850007048157144e+02 -2.304987961744356e+02 1.612563397119954e+01 + 3 7.018897389129393e+02 -3.764226030262937e+02 4.376344751014919e+02 3.992884868423145e+02 + 4 5.021119639785323e+02 1.914218982105791e+02 -2.071356789270569e+02 -4.154141208135140e+02 + ME 4.613150284710148e-03 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.521089721327344e+02 1.223876815062618e+02 -3.629066091228881e+01 -5.371485459866159e+02 + 3 4.098988410471213e+02 -5.841964900319320e+01 -3.626461945087766e+02 1.819119075553315e+02 + 4 5.379921868201441e+02 -6.396803250306868e+01 3.989368554210654e+02 3.552366384312844e+02 + ME 5.697586324432868e-05 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.143828168925966e+02 -4.584044193456334e+02 -2.419772079280939e+02 
-4.915844060170315e+02 + 3 1.284110307517518e+02 8.324300347118131e+01 -7.889851197070544e+01 5.774963203893761e+01 + 4 6.572061523556517e+02 3.751614158744521e+02 3.208757198987993e+02 4.338347739780939e+02 + ME 1.533448672467431e-04 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.394390210968652e+02 -2.137451655543887e+02 -3.779414621253705e+02 -6.767502250635194e+01 + 3 4.431311911324731e+02 3.845666395406355e+02 -2.150363068358314e+02 4.725610065709544e+01 + 4 6.174297877706620e+02 -1.708214739862470e+02 5.929777689612019e+02 2.041892184925614e+01 + ME 1.381297151658138e-04 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.301725729481176e+02 4.281927891852710e+02 5.652737593150771e+02 -1.739784429324868e+02 + 3 7.567373964415995e+01 2.589885732647598e+01 -5.696550981957816e+01 4.255225906941358e+01 + 4 6.941536874077224e+02 -4.540916465117469e+02 -5.083082494954988e+02 1.314261838630732e+02 + ME 7.786711423215849e-04 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.361152320236989e+02 -3.738769057978322e+02 1.427754799584549e+02 -1.732850750548248e+02 + 3 5.817148313055658e+02 5.081993893256958e+02 2.829214478037172e+02 -8.998890070513916e+00 + 4 4.821699366707354e+02 -1.343224835278637e+02 -4.256969277621723e+02 1.822839651253388e+02 + ME 4.759619493182072e-05 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
6.097675704107204e+02 3.288514690970509e+02 4.971291587853200e+02 -1.285916042465611e+02 + 3 5.709532610348123e+02 -6.501292612520261e+01 -4.768258747557200e+02 3.072426254385416e+02 + 4 3.192791685544673e+02 -2.638385429718483e+02 -2.030328402960006e+01 -1.786510211919805e+02 + ME 4.562874664526664e-04 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.258641293880481e+02 3.743515439843765e+02 -1.622018320411498e+02 -4.746128903155365e+02 + 3 7.438702198751357e+02 -4.029113627030088e+02 2.325939036896868e+02 5.804355380128616e+02 + 4 1.302656507368159e+02 2.855981871863234e+01 -7.039207164853697e+01 -1.058226476973251e+02 + ME 6.470244641171997e-03 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.731957242404369e+02 1.596860493342637e+01 -3.714568973276624e+02 3.224632809376675e+01 + 3 6.079923612940432e+02 4.451199598539357e+02 3.189341902600864e+02 -2.642043054431177e+02 + 4 5.188119144655197e+02 -4.610885647873621e+02 5.252270706757586e+01 2.319579773493509e+02 + ME 4.834805524912912e-05 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.084256499213536e+02 6.318790977834965e+02 -2.229764540025608e+02 2.299504472951746e+02 + 3 5.168612394424736e+01 1.130069959366449e+01 -1.428140623590626e+01 4.837138651102396e+01 + 4 7.398882261343986e+02 -6.431797973771611e+02 2.372578602384670e+02 -2.783218338061984e+02 + ME 5.570675737266669e-02 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.644037677826096e+02 -7.446914007305443e+01 3.170710956176409e+02 4.609467220707991e+02 + 3 4.303832728799333e+02 -1.588265612792408e+02 -3.994808673830752e+02 -2.046757440246668e+01 + 4 5.052129593374569e+02 2.332957013522950e+02 8.240977176543437e+01 -4.404791476683325e+02 + ME 8.149078876518943e-03 + +Event 69 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.379282923937936e+02 -4.413455715133101e+01 1.058497776082811e+02 -2.084654354245804e+02 + 3 5.822935131976620e+02 -5.806422676829346e+02 4.095409019445289e+01 -1.559022092337181e+01 + 4 6.797781944085447e+02 6.247768248342657e+02 -1.468038678027338e+02 2.240556563479523e+02 + ME 3.151643344815433e-04 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.861861307467999e+02 1.831219916849830e+02 2.904683423406074e+02 -4.750880530376755e+02 + 3 4.633200606614190e+02 -4.245314712871158e+02 -1.339518705596282e+02 1.284344380284136e+02 + 4 4.504938085917810e+02 2.414094796021329e+02 -1.565164717809791e+02 3.466536150092621e+02 + ME 3.806065739739615e-05 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.383412459951699e+02 5.748049255568963e+02 -1.639684737984460e+02 -4.334298474879633e+02 + 3 3.973981306646684e+02 -3.228684354469153e+02 -4.837114091238284e+00 2.316416412804533e+02 + 4 3.642606233401616e+02 -2.519364901099809e+02 1.688055878896842e+02 2.017882062075102e+02 + ME 2.401042559453063e-05 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.538199915090661e+02 3.512029503136998e+02 -6.467835580753928e+00 -4.246458742680748e+01 + 3 5.344234504985294e+02 1.310173344785610e+01 3.836805260246263e+01 5.328833470497181e+02 + 4 6.117565579924037e+02 -3.643046837615557e+02 -3.190021702170875e+01 -4.904187596229106e+02 + ME 9.250471060538988e-03 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.694927197571709e+02 1.451947293992221e+02 -1.807863847612341e+02 4.082379055705570e+02 + 3 5.537325951281177e+02 -5.796379956652486e+01 5.401382741253894e+02 -1.072876026015002e+02 + 4 4.767746851147115e+02 -8.723092983269748e+01 -3.593518893641554e+02 -3.009503029690568e+02 + ME 1.102130402813462e-03 + +Event 74 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.258444305735200e+02 -3.349227552763228e+02 4.941036656040853e+02 1.880679848209579e+02 + 3 5.555040664889823e+02 3.765538795180095e+01 -5.474422011270133e+02 -8.645158222500019e+01 + 4 3.186515029374983e+02 2.972673673245214e+02 5.333853552292793e+01 -1.016164025959579e+02 + ME 1.610729877029936e-04 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.943316317993887e+02 5.588489849751633e+01 -2.552251009651267e+02 -2.953548066221912e+02 + 3 5.467466262348044e+02 -3.021648543602058e+02 -2.377479281839000e+02 3.887212326756534e+02 + 4 5.589217419658070e+02 2.462799558626894e+02 4.929730291490267e+02 -9.336642605346221e+01 + ME 1.443725900737479e-04 + +Event 76 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.517772830004059e+02 2.282681125856672e+02 -4.885490190451381e+02 -1.169260227747471e+02 + 3 4.245403880864563e+02 -2.793100283061228e+02 1.521744876196477e+02 -2.811821020654221e+02 + 4 5.236823289131380e+02 5.104191572045557e+01 3.363745314254903e+02 3.981081248401691e+02 + ME 4.757814473763855e-05 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.781543446472004e+02 -5.926925448310482e+01 -1.775497893613220e+02 3.285786605157444e+02 + 3 6.702964816234125e+02 -6.066564226432875e+01 -1.057468051743550e+02 -6.591165802199179e+02 + 4 4.515491737293868e+02 1.199348967474336e+02 2.832965945356771e+02 3.305379197041733e+02 + ME 5.329303910865636e-05 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.564262045363142e+02 1.882572856930395e+02 1.751822011208170e+02 -3.770878823051469e+02 + 3 3.809544602625753e+02 -2.816334489555118e+02 1.992812047321845e+02 -1.615422627793184e+02 + 4 6.626193352011105e+02 9.337616326247232e+01 -3.744634058530015e+02 5.386301450844653e+02 + ME 2.607637186752680e-04 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.126536521478916e+02 6.075062399138448e+02 -4.178945028651391e+01 6.733726903166686e+01 + 3 2.872846052831653e+02 -1.084163947926164e+02 2.139961846825775e+01 2.651799127051088e+02 + 4 6.000617425689423e+02 -4.990898451212284e+02 2.038983181825617e+01 
-3.325171817367755e+02 + ME 1.929445837008997e-03 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.171281258707706e+02 -2.756641813219368e+02 1.445082905894676e+01 3.127240094205691e+02 + 3 3.805235327384963e+02 -2.955852199231463e+02 2.395269588958385e+02 7.373784162959280e+00 + 4 7.023483413907346e+02 5.712494012450838e+02 -2.539777879547847e+02 -3.200977935835285e+02 + ME 1.298366311664187e-03 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.471091333863934e+02 -9.753029041192968e+01 7.407154559164039e+02 -7.162458282064984e-01 + 3 6.775352561453885e+02 9.550863422814814e+01 -6.702673865908516e+02 -2.595678293896890e+01 + 4 7.535561046821789e+01 2.021656183781575e+00 -7.044806932555213e+01 2.667302876717549e+01 + ME 8.484169004565843e-05 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.309094465924172e+02 3.042233433179615e+02 2.799835808203349e+02 -1.214096495919827e+02 + 3 5.540384887187944e+02 -4.824447657759212e+02 1.988969596446624e+02 1.861335391629671e+02 + 4 5.150520646887883e+02 1.782214224579596e+02 -4.788805404649973e+02 -6.472388957098455e+01 + ME 1.083045798453776e-04 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.869534474909296e+02 -4.727010820510886e+02 1.062322962656183e+02 4.890855018466119e+01 + 3 3.520990385354405e+02 -1.437544586613779e+02 -3.142298368411061e+02 6.758696761482641e+01 + 4 
6.609475139736300e+02 6.164555407124666e+02 2.079975405754879e+02 -1.164955177994876e+02 + ME 2.987086265945440e-04 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.391975815431583e+01 -3.682657486111166e-01 -1.138840508663313e+01 -7.995516055627094e+00 + 3 7.493632094786752e+02 -3.452281541586203e+01 3.833012084573050e+02 6.429880080772213e+02 + 4 7.367170323670086e+02 3.489108116447314e+01 -3.719128033706719e+02 -6.349924920215941e+02 + ME 3.916679740835151e-01 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.362448947738020e+02 6.409220704967113e+02 3.243429451315054e+02 1.614840505254833e+02 + 3 1.517836214454495e+02 -1.266859291808411e+02 -6.780846852200752e+01 4.889738933094901e+01 + 4 6.119714837807480e+02 -5.142361413158706e+02 -2.565344766094980e+02 -2.103814398564324e+02 + ME 5.348397167404527e-04 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.451728369778394e+02 -6.605005893803174e+01 1.066920544886257e+02 -5.305352178712970e+02 + 3 3.158718592284831e+02 -1.755596039144848e+02 2.550395858012224e+02 6.251932981237659e+01 + 4 6.389553037936776e+02 2.416096628525166e+02 -3.617316402898482e+02 4.680158880589204e+02 + ME 1.544425886663799e-04 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.414211232216658e+02 1.437256906952883e+02 1.534640422371205e+02 -2.689983214749667e+02 + 3 5.081668091119998e+02 
4.794742948200324e+02 -1.464748766741244e+02 8.296394996143997e+01 + 4 6.504120676663338e+02 -6.231999855153206e+02 -6.989165562996422e+00 1.860343715135267e+02 + ME 1.892507081880852e-04 + +Event 88 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.925516585730865e+02 1.655911293372512e+01 2.598275245766865e+02 -1.334238591297045e+02 + 3 7.159840369510271e+02 -1.056844973272874e+02 -3.694097043713192e+02 6.041526284885821e+02 + 4 4.914643044758866e+02 8.912538439356234e+01 1.095821797946327e+02 -4.707287693588777e+02 + ME 8.757051659862254e-02 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.333634651097187e+02 1.209853522660007e+02 5.372166546881791e+02 -3.129058794565920e+02 + 3 6.221307427802805e+02 5.757192259699379e+01 -4.327483989541182e+02 4.432391657372765e+02 + 4 2.445057921100010e+02 -1.785572748629945e+02 -1.044682557340609e+02 -1.303332862806847e+02 + ME 5.429506382738342e-04 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.111538587406461e+02 2.628215106651484e+02 -6.985334981761831e+01 -1.512021390726355e+02 + 3 5.216486323898988e+02 1.252715366480781e+02 4.457714554600226e+02 -2.402335265468457e+02 + 4 6.671975088694549e+02 -3.880930473132266e+02 -3.759181056424042e+02 3.914356656194811e+02 + ME 2.358905038211854e-04 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.007803348469016e+02 8.390513937949673e+01 
2.884042062049404e+02 -1.586667134655829e+01 + 3 6.256884422056424e+02 2.364580673743878e+02 -3.590826126759745e+02 -4.545693416378727e+02 + 4 5.735312229474563e+02 -3.203632067538847e+02 7.067840647103418e+01 4.704360129844310e+02 + ME 6.686750865072310e-05 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.843865618656529e+02 -2.264962467301474e+02 -5.909185329480341e+02 2.605757158639088e+02 + 3 6.645516272550813e+02 3.453347116263075e+02 4.983670680340541e+02 -2.720350487207342e+02 + 4 1.510618108792659e+02 -1.188384648961601e+02 9.255146491398018e+01 1.145933285682523e+01 + ME 9.149986131393786e-05 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.579763469381437e+02 2.180908585044469e+02 5.135246110359700e+02 8.151996049101450e+00 + 3 3.333821836060119e+02 1.681122988324203e+02 -1.261705574188214e+02 2.587719570738212e+02 + 4 6.086414694558449e+02 -3.862031573368672e+02 -3.873540536171489e+02 -2.669239531229221e+02 + ME 5.067568906762270e-04 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.534979734151987e+02 1.139662723650678e+02 2.686183171543304e+01 4.381216071501100e+02 + 3 3.856184698299742e+02 1.545134372854229e+02 -3.452526490806396e+02 7.501873282757614e+01 + 4 6.608835567548277e+02 -2.684797096504911e+02 3.183908173652065e+02 -5.131403399776862e+02 + ME 7.034498121926024e-03 + +Event 95 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 
+ 2 2.828073115974177e+02 -5.711637476392463e+01 5.915078172645689e+01 -2.705898746219726e+02 + 3 6.809618671276162e+02 3.772100991821226e+02 3.247893528880089e+02 4.646864338535507e+02 + 4 5.362308212749671e+02 -3.200937244181981e+02 -3.839401346144664e+02 -1.940965592315787e+02 + ME 2.736640673720789e-04 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.639832102051441e+02 -4.275497908582962e+02 -1.317248975374901e+02 -1.230046627491649e+02 + 3 7.474114851375484e+02 6.594176555428719e+02 2.654537688070380e+02 2.309254864669503e+02 + 4 2.886053046573076e+02 -2.318678646845757e+02 -1.337288712695478e+02 -1.079208237177853e+02 + ME 2.543637407455673e-04 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.095921959312568e+02 3.190102848863560e+02 3.100341192456060e+02 2.485869851668986e+02 + 3 4.555541331018014e+02 -2.788120391899956e+02 2.221549471930724e+02 -2.836205112936887e+02 + 4 5.348536709669416e+02 -4.019824569636056e+01 -5.321890664386783e+02 3.503352612679006e+01 + ME 7.879492638365783e-05 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.299941952467790e+02 -2.570048161992350e+02 -4.630296380940593e+02 -2.111695271961878e+01 + 3 7.352146396921255e+02 2.361229278157243e+02 6.962552486063584e+02 3.893348873424185e+00 + 4 2.347911650610957e+02 2.088188838351074e+01 -2.332256105122990e+02 1.722360384619465e+01 + ME 7.376532633734115e-05 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.290897291078425e+02 3.747236205606835e+02 2.040795775432686e+02 -4.529602465443949e+01 + 3 6.438744429739487e+02 -5.215755139094103e+02 2.133414139578182e+01 3.769325350988583e+02 + 4 4.270358279182090e+02 1.468518933487271e+02 -2.254137189390505e+02 -3.316365104444187e+02 + ME 2.039319695622888e-03 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.119062275524874e+02 -4.721600394809320e+02 -1.845880136125885e+02 7.099400083769525e+01 + 3 4.523854579707451e+02 2.836789572262426e+02 -3.060214184981774e+02 -1.747276258374610e+02 + 4 5.357083144767674e+02 1.884810822546894e+02 4.906094321107658e+02 1.037336249997658e+02 + ME 6.755424805663687e-05 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.024072815192739e+02 -3.080418730730875e+02 -4.692284526425155e+02 2.186993289696520e+02 + 3 3.347434020484399e+02 8.940653726951260e+01 -3.939923552329939e+01 -3.201676381969582e+02 + 4 5.628493164322862e+02 2.186353358035749e+02 5.086276881658150e+02 1.014683092273061e+02 + ME 8.558948355107502e-05 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.910857738801293e+02 3.707548039128416e+02 -7.516477307090545e+01 -4.541734518311493e+02 + 3 2.311218706704978e+02 4.536804143672514e+01 -2.262982016400413e+02 1.217307902336991e+01 + 4 6.777923554493721e+02 -4.161228453495667e+02 3.014629747109467e+02 4.420003728077793e+02 + ME 2.658621666912669e-04 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.627949406417043e+02 7.189602123685953e+01 -6.391860825813610e+02 -1.599038689489492e+02 + 3 5.519979886399103e+02 1.442810582977180e+02 4.734454174874869e+02 2.444057944057306e+02 + 4 2.852070707183857e+02 -2.161770795345774e+02 1.657406650938741e+02 -8.450192545678139e+01 + ME 1.613104291263224e-04 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.368180791462561e+02 -3.483499330357900e+02 -2.596280064690262e+02 4.533935023690695e+01 + 3 4.635715977792427e+02 1.873023362819024e+02 -2.251347602994603e+02 -3.593477435519052e+02 + 4 5.996103230745008e+02 1.610475967538876e+02 4.847627667684864e+02 3.140083933149983e+02 + ME 8.870389269707660e-05 + +Event 105 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.701708357490472e+02 2.288495716262108e+02 -4.521314661478371e+02 -2.613422905391967e+02 + 3 3.711008490497919e+02 -3.362590561223711e+02 -8.126001400906794e+01 1.343223639771668e+02 + 4 5.587283152011615e+02 1.074094844961604e+02 5.333914801569050e+02 1.270199265620299e+02 + ME 7.016289908316780e-05 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.775588183099670e+02 5.149765831731703e+02 3.445381345095062e+02 -2.741870619150275e+02 + 3 7.044100837534631e+02 -4.546975847980704e+02 -4.392260662935806e+02 3.106833358270534e+02 + 4 1.180310979365711e+02 -6.027899837509906e+01 9.468793178407483e+01 -3.649627391202603e+01 + ME 3.285753323608975e-04 + +Event 107 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.046880513041550e+02 2.289413119004024e+02 -5.349774474143720e+02 -1.644160754103498e+02 + 3 3.366746442316214e+02 -7.166101576320898e+01 2.452245434825371e+01 3.280444544890399e+02 + 4 5.586373044642237e+02 -1.572802961371934e+02 5.104549930661184e+02 -1.636283790786902e+02 + ME 8.468114452980301e-04 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.239206451413978e+02 -2.218030564243363e+02 5.011455197099735e+02 -2.982172759400455e+02 + 3 2.841199272340513e+02 1.209406641294798e+02 7.967327320293103e+01 2.444374323800143e+02 + 4 5.919594276245515e+02 1.008623922948564e+02 -5.808187929129044e+02 5.377984356003118e+01 + ME 1.676371005938700e-04 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.093404598873124e+02 1.546999830656544e+02 1.629193992247174e+02 2.126421988200774e+02 + 3 5.287372542258961e+02 -2.136116696975048e+02 -1.865832176193536e+02 4.462284633214169e+02 + 4 6.619222858867910e+02 5.891168663185048e+01 2.366381839463621e+01 -6.588706621414941e+02 + ME 1.698115896932016e+01 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.920948406187608e+02 -8.595212543403569e+01 -4.824913009925944e+02 -4.440392734262522e+01 + 3 4.634042325716594e+02 -2.085760624772916e+00 1.255608851371820e+02 4.460645653843308e+02 + 4 5.445009268095798e+02 8.803788605880843e+01 3.569304158554124e+02 
-4.016606380417056e+02 + ME 4.079038282373521e-03 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.637454700443118e+02 1.543048221589588e+02 -4.372769385391800e+02 6.225902899506631e+00 + 3 3.246747011850292e+02 -5.128652792678845e+01 -2.274142471268230e+02 2.259781269206006e+02 + 4 7.115798287706587e+02 -1.030182942321705e+02 6.646911856660031e+02 -2.322040298201072e+02 + ME 1.255922694311404e-03 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.923761777814548e+02 3.939190124845535e+02 4.398224952082177e+01 -5.676954684419624e+02 + 3 5.277418353503031e+02 -4.270527740856185e+02 4.970714905179167e+01 3.060499505927538e+02 + 4 2.798819868682419e+02 3.313376160106501e+01 -9.368939857261344e+01 2.616455178492086e+02 + ME 6.058726848610135e-05 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.174898838850695e+02 -6.130145063482009e+02 3.726797356942233e+02 1.071275347265524e+01 + 3 1.705115822510491e+02 3.993583199494100e+01 -1.624320619120163e+02 3.309311510932530e+01 + 4 6.119985338638814e+02 5.730786743532598e+02 -2.102476737822071e+02 -4.380586858198050e+01 + ME 2.021042103893936e-04 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.772826088252361e+02 -1.430288042596956e+02 -3.410390118171984e+02 5.674036356844297e+02 + 3 6.725037798358684e+02 3.626161999767237e+01 2.510744134018114e+02 -6.228226615527176e+02 + 4 
1.502136113388952e+02 1.067671842620232e+02 8.996459841538710e+01 5.541902586828807e+01 + ME 6.854173934730765e-05 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 9.320551230331127e+01 1.288474310894607e+01 -2.581623869377880e+01 8.862715576190527e+01 + 3 6.672654287607166e+02 1.525114284892182e+02 2.829200767588877e+02 5.847560574856375e+02 + 4 7.395290589359722e+02 -1.653961715981643e+02 -2.571038380651087e+02 -6.733832132475428e+02 + ME 1.834493541349850e+00 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.951202926530015e+02 -4.575339943514647e+02 4.220102313368785e+01 1.844608951947750e+02 + 3 3.101750696753587e+02 -4.711582585559527e+01 2.172188132736168e+02 2.163438466008693e+02 + 4 6.947046376716394e+02 5.046498202070600e+02 -2.594198364073050e+02 -4.008047417956444e+02 + ME 1.927887528540367e-03 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.543248494478489e+02 1.390926466871539e+02 9.107024539473490e+01 6.328510524967591e+02 + 3 5.040443237953713e+02 6.874740772121054e+01 1.336336536624387e+02 -4.811200690999848e+02 + 4 3.416308267567792e+02 -2.078400544083643e+02 -2.247038990571737e+02 -1.517309833967742e+02 + ME 4.006310956243752e-04 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.829230400014206e+02 5.307803371482089e+02 -3.192285892796672e+01 2.388565162167381e+02 + 3 3.965113090906140e+02 
-5.470249758902820e+01 2.256187790844517e+02 -3.214420966810604e+02 + 4 5.205656509079653e+02 -4.760778395591807e+02 -1.936959201564850e+02 8.258558046432242e+01 + ME 6.932073481888603e-05 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.549567073991256e+02 2.281637891139605e+02 1.474502150787006e+02 2.284600261271838e+02 + 3 4.727085372220641e+02 7.463684946128349e+01 -3.092948822053328e+02 3.495988811576870e+02 + 4 6.723347553788105e+02 -3.028006385752440e+02 1.618446671266322e+02 -5.780589072848709e+02 + ME 1.470931756443164e-02 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.192117275853698e+02 4.094232477570927e+02 -5.552624156333899e+02 -2.032775518283800e+02 + 3 3.685061529232585e+02 -2.522084621786424e+02 1.741347663658646e+02 2.046087962197375e+02 + 4 4.122821194913712e+02 -1.572147855784500e+02 3.811276492675253e+02 -1.331244391357209e+00 + ME 8.451523281185753e-05 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.923953846467517e+02 -5.182078839520094e+01 -1.486351786617837e+02 -1.106262789198433e+02 + 3 6.582127150877787e+02 -3.509182841037629e+02 -1.191939510078700e+02 5.439606035624541e+02 + 4 6.493919002654695e+02 4.027390724989639e+02 2.678291296696539e+02 -4.333343246426108e+02 + ME 1.991961208793247e-03 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.905732817636245e+02 3.462508192534569e+02 
-5.375670569609783e+02 -2.608131264380774e+02 + 3 7.097575386120016e+02 -2.677396278645660e+02 5.849221766424141e+02 2.998954860604125e+02 + 4 9.966917962437384e+01 -7.851119138889092e+01 -4.735511968143583e+01 -3.908235962233508e+01 + ME 5.024875501213438e-04 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.035126033432559e+02 2.481103298242073e+01 -3.878573016343356e+02 -1.085059780294573e+02 + 3 3.541388771651664e+02 1.572344474048876e+02 -3.105653677404273e+02 -6.512161875550808e+01 + 4 7.423485194915778e+02 -1.820454803873083e+02 6.984226693747627e+02 1.736275967849660e+02 + ME 2.050184629797264e-02 + +Event 124 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.353042728143346e+02 -4.785252055946480e+02 -2.279396245170433e+02 7.488537693644093e+01 + 3 7.454081943698109e+02 6.785307544150929e+02 3.069354144183444e+02 -3.193811081429427e+01 + 4 2.192875328158540e+02 -2.000055488204448e+02 -7.899578990130101e+01 -4.294726612214667e+01 + ME 1.486260367599417e-04 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.351681880566980e+02 -1.932492970253980e+01 -4.393064933429818e+02 -5.891592456452272e+02 + 3 6.537497908129355e+02 -2.883189353576721e+01 3.454898907503182e+02 5.542510679217787e+02 + 4 1.110820211303664e+02 4.815682323830691e+01 9.381660259266361e+01 3.490817772344844e+01 + ME 1.165457062729126e-04 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.568747108147123e+02 1.149185667256989e+02 4.264979152236774e+02 -3.391204725116689e+02 + 3 6.934211462641821e+02 -1.939160042589617e+02 -6.294239612595662e+02 2.169215212257339e+02 + 4 2.497041429211052e+02 7.899743753326275e+01 2.029260460358889e+02 1.221989512859350e+02 + ME 4.065853990136104e-05 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.108931196972316e+02 4.270547743949553e+02 5.664613189451065e+02 -4.598718776252147e+01 + 3 4.445675167124290e+02 -1.247884466860518e+02 -4.129475031266345e+02 1.074359351009545e+02 + 4 3.445393635903407e+02 -3.022663277089035e+02 -1.535138158184720e+02 -6.144874733843321e+01 + ME 1.120622321583768e-04 + +Event 128 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.312407894292421e+02 -7.192118124205533e+01 -4.398126160332176e+02 -2.891521793453568e+02 + 3 5.717192413787027e+02 3.434745903572437e+02 1.811915566412192e+02 4.195923218357252e+02 + 4 3.970399691920551e+02 -2.715534091151883e+02 2.586210593919984e+02 -1.304401424903685e+02 + ME 1.862188956509376e-04 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.644129951428380e+02 -3.595672586482286e+02 4.645590915434781e+02 3.103882489514913e+02 + 3 1.967652372382453e+02 -5.204943416929044e+01 8.794498000645014e+00 -1.895522930301723e+02 + 4 6.388217676189165e+02 4.116166928175190e+02 -4.733535895441231e+02 -1.208359559213190e+02 + ME 2.908109601297223e-04 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.302263990443511e+02 -1.919590472356484e+02 3.836584700935805e+02 -5.909217345563752e+02 + 3 4.156541164903923e+02 2.203243106780774e+02 -1.767969453775071e+02 3.049071707664833e+02 + 4 3.541194844652567e+02 -2.836526344242890e+01 -2.068615247160734e+02 2.860145637898919e+02 + ME 2.394368537213786e-05 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.308323688168238e+02 -1.780469473698229e+02 1.469011263880862e+02 1.710582294195633e+00 + 3 7.308075033948297e+02 5.219262643529273e+02 -3.840435213624621e+02 3.379099810545738e+02 + 4 5.383601277883466e+02 -3.438793169831045e+02 2.371423949743758e+02 -3.396205633487695e+02 + ME 1.096116431922811e-03 + +Event 132 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.909630762789656e+02 -4.293852116769705e+02 -3.988922148105424e+02 7.583335995300345e+01 + 3 5.415993952096327e+02 2.260703809971038e+02 3.221145619770359e+02 -3.721079100067703e+02 + 4 3.674375285114019e+02 2.033148306798665e+02 7.677765283350676e+01 2.962745500537669e+02 + ME 2.827339250064479e-05 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.506052863582995e+02 2.189991325227701e+02 -3.914006430783633e+02 -4.347459771134344e+01 + 3 4.043998006859108e+02 3.160348074769271e+02 8.738893432792007e+01 2.366946839598571e+02 + 4 6.449949129557899e+02 -5.350339399996972e+02 3.040117087504432e+02 -1.932200862485140e+02 + ME 3.155402842775054e-04 + +Event 134 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.151470882937615e+02 -1.041377497037514e+01 -4.186394096729772e+01 7.138447461686594e+02 + 3 3.416424731356658e+02 1.638631808685802e+02 3.081581136487585e+01 -2.981925940995342e+02 + 4 4.432104385705717e+02 -1.534494058982045e+02 1.104812960242201e+01 -4.156521520691246e+02 + ME 5.522638352617387e-02 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.115730144432832e+02 -3.219296530898238e+02 2.184242454110169e+02 -5.958089478700319e+02 + 3 1.627059459894212e+02 -6.880794311551747e+01 -3.259803939022061e+01 1.437917231708342e+02 + 4 6.257210395672955e+02 3.907375962053413e+02 -1.858262060207963e+02 4.520172246991979e+02 + ME 2.042900794550245e-04 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.195404287114590e+02 -4.369992732083462e+02 -4.270318019286998e+02 3.800182941743400e+02 + 3 6.668605996318225e+02 3.634158794560480e+02 4.690430049045652e+02 -3.043527845290678e+02 + 4 1.135989716567186e+02 7.358339375229816e+01 -4.201120297586537e+01 -7.566550964527266e+01 + ME 1.800654136529322e-03 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.722782806745004e+02 -6.045581260407006e+02 -2.538460778300669e+02 1.484241478840623e+02 + 3 6.869263774705696e+02 6.661257235671317e+02 1.481819739565760e+02 -7.865412297735674e+01 + 4 1.407953418549305e+02 -6.156759752643100e+01 1.056641038734909e+02 -6.977002490670537e+01 + ME 
5.228666064275611e-04 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.463287544295633e+02 8.684709774942756e+01 2.409249839962013e+02 -5.934253049048401e+02 + 3 3.917330799270068e+02 1.767690441671677e+02 4.696120064017492e+01 3.464132742372293e+02 + 4 4.619381656434300e+02 -2.636161419165952e+02 -2.878861846363762e+02 2.470120306676108e+02 + ME 6.434051738930020e-05 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.994802063237942e+02 -1.272876183039154e+02 6.552211336810932e+00 2.710042891410715e+02 + 3 7.257546970836095e+02 -8.848613612326723e+00 5.127896146768585e+00 -7.256826352181578e+02 + 4 4.747650965925944e+02 1.361362319162418e+02 -1.168010748357914e+01 4.546783460770865e+02 + ME 1.459660346124279e-04 + +Event 140 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.326756101999780e+02 5.655005379385240e+02 4.343799907428445e+02 1.683351270988810e+02 + 3 7.428339005597779e+02 -5.680473426214219e+02 -4.534832054058505e+02 -1.532233754243464e+02 + 4 2.449048924024402e+01 2.546804682897962e+00 1.910321466300584e+01 -1.511175167453447e+01 + ME 4.728268966313668e-03 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.363238871411324e+02 -6.772722174663235e+02 -2.824373475598686e+02 -6.086341204880691e+01 + 3 5.504260535970959e+02 4.650298533191526e+02 2.914345410616539e+02 4.221355560271698e+01 + 4 2.132500592617707e+02 
2.122423641471711e+02 -8.997193501785842e+00 1.864985644608984e+01 + ME 6.682959610872366e-05 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.862280565156748e+02 4.248793793116474e+01 -2.479279504752428e+02 -5.295184989682996e+02 + 3 4.287264749982904e+02 -3.025296967755301e+02 2.785471849307630e+02 1.212173201341823e+02 + 4 4.850454684860337e+02 2.600417588443672e+02 -3.061923445552111e+01 4.083011788341167e+02 + ME 5.047408758770930e-05 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.464531733710510e+02 4.046044690030689e+01 -2.103865804466287e+02 1.218179201483224e+02 + 3 5.378449948854584e+02 4.607829603950881e+02 -2.747641700963840e+02 3.822241180409942e+01 + 4 7.157018317434902e+02 -5.012434072953949e+02 4.851507505430127e+02 -1.600403319524220e+02 + ME 1.314936151730086e-03 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.367418008803521e+02 -1.343004856786532e+02 -4.048537736989351e+02 -3.258044847458254e+02 + 3 6.294877130859599e+02 3.313530054622211e+02 5.282137272543232e+02 8.631468610520756e+01 + 4 3.337704860336883e+02 -1.970525197835678e+02 -1.233599535553879e+02 2.394897986406179e+02 + ME 3.080587460406394e-05 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.805380148481771e+01 -3.411514819754512e+01 -4.339750646760406e+01 -3.980116822894492e+01 + 3 6.831461500979880e+02 -3.834019790669201e+02 
-2.756424954453614e+02 -4.936727656514237e+02 + 4 7.488000484171945e+02 4.175171272644653e+02 3.190400019129655e+02 5.334739338803686e+02 + ME 4.881688276836065e-01 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.031746658797123e+02 4.202301876294930e+02 2.767377273314875e+02 2.750283520766640e+00 + 3 4.317115817339341e+02 -1.098088257924671e+02 -5.455162180567243e+01 4.139336083717602e+02 + 4 5.651137523863538e+02 -3.104213618370259e+02 -2.221861055258150e+02 -4.166838918925268e+02 + ME 4.366905043615835e-03 + +Event 147 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.251223043705629e+02 -4.223502783198937e+02 -4.694338569631594e+01 1.206377286808446e+01 + 3 5.457819748703675e+02 2.791608945230573e+02 -4.384138579515957e+02 -1.665546403390878e+02 + 4 5.290957207590694e+02 1.431893837968363e+02 4.853572436479119e+02 1.544908674710035e+02 + ME 5.887647046052965e-05 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.905785821272530e+02 6.249608768654492e+02 -6.243387159972372e+01 -2.870970082698921e+02 + 3 1.361638260920091e+02 2.862044352088506e+01 1.704210379179795e+01 1.320266050727364e+02 + 4 6.732575917807409e+02 -6.535813203863348e+02 4.539176780792521e+01 1.550704031971582e+02 + ME 8.852162684733297e-04 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.694705528096943e+02 -5.216497821741067e+02 -3.785079074709545e+02 
1.811189935345937e+02 + 3 2.821401257551277e+02 1.148500354702071e-01 2.786662494166578e+02 -4.413795199872403e+01 + 4 5.483893214351777e+02 5.215349321386363e+02 9.984165805429679e+01 -1.369810415358696e+02 + ME 1.853722035345691e-04 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.637486188995367e+02 -4.033412855298820e+02 -2.279949807412008e+02 -1.992178895453991e+01 + 3 3.756800751656201e+02 6.230662615514298e+01 -2.632310737913946e+02 -2.606967683041707e+02 + 4 6.605713059348441e+02 3.410346593747393e+02 4.912260545325954e+02 2.806185572587108e+02 + ME 2.118053076647671e-04 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.821954355913596e+02 -2.528320044280691e+02 2.861764538722268e+02 1.588602445142563e+01 + 3 6.796189325418251e+02 2.911670128135292e+02 -4.900375979142739e+02 3.700902818893582e+02 + 4 4.381856318668152e+02 -3.833500838546018e+01 2.038611440420471e+02 -3.859763063407838e+02 + ME 8.250156577235518e-03 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.751133298339792e+02 -2.999578895043981e+02 -2.855974213275218e+02 -5.331391803034741e+02 + 3 4.976977783498468e+02 -3.003988119418482e+00 1.843802943840355e+02 4.622747685874795e+02 + 4 3.271888918161745e+02 3.029618776238166e+02 1.012171269434863e+02 7.086441171599445e+01 + ME 1.134099335487607e-04 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
1.729293620257127e+02 1.558357805102956e+02 -7.193392860849491e+01 2.110174585940510e+01 + 3 6.524550819255464e+02 2.410158908712478e+02 5.786677971610501e+02 1.809766692333240e+02 + 4 6.746155560487410e+02 -3.968516713815435e+02 -5.067338685525552e+02 -2.020784150927291e+02 + ME 6.333956895505059e-04 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.585658455851003e+02 -2.410305357139302e+02 -2.116446673272158e+02 -5.751693564652296e+02 + 3 5.764400833248006e+02 3.388133979948971e+02 3.092747322371399e+02 3.490527051926401e+02 + 4 2.649940710900989e+02 -9.778286228096692e+01 -9.763006490992422e+01 2.261166512725895e+02 + ME 4.954333779494758e-05 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.686586231936362e+02 -1.693366246265499e+02 -1.542203680657918e+02 5.204938187588980e+02 + 3 1.882190564276537e+02 -1.089234770645493e+02 -9.145416397064868e+01 1.232810822434430e+02 + 4 7.431223203787106e+02 2.782601016910993e+02 2.456745320364405e+02 -6.437749010023409e+02 + ME 6.811316196354057e-01 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.143652095725132e+02 2.879464601546110e+02 5.379391909976825e+02 -7.178351904348051e+01 + 3 6.287751645293093e+02 -4.584164185734782e+02 -4.225140875260601e+02 -8.181956094447750e+01 + 4 2.568596258981783e+02 1.704699584188668e+02 -1.154251034716223e+02 1.536030799879582e+02 + ME 2.935639485299561e-05 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.050842109798971e+02 4.185498850973046e+02 -1.305174306570672e+02 -2.507812875014723e+02 + 3 5.170424494038049e+02 -3.084595065654855e+02 3.930456446728388e+02 -1.330441599566700e+02 + 4 4.778733396162974e+02 -1.100903785318191e+02 -2.625282140157716e+02 3.838254474581423e+02 + ME 4.142654907451842e-05 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.312542366204099e+02 -3.114503370626313e+02 2.737030704635237e+02 1.185982013584742e+02 + 3 6.944315393047832e+02 2.166643175309469e+02 -6.173965008138002e+02 -2.326226495269425e+02 + 4 3.743142240748071e+02 9.478601953168445e+01 3.436934303502765e+02 1.140244481684682e+02 + ME 4.047758478186621e-05 + +Event 159 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.860112473308646e+02 -1.581297551692178e+02 4.935632758462007e+02 2.734948907463652e+02 + 3 3.772013313646349e+02 -2.371132827856262e+02 -1.305099443644436e+02 -2.627266448837395e+02 + 4 5.367874213045002e+02 3.952430379548442e+02 -3.630533314817573e+02 -1.076824586262577e+01 + ME 9.729641668910554e-05 + +Event 160 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.883409724804535e+02 -3.739819298758818e+02 -2.887651121595530e+02 3.505671490956299e+02 + 3 4.300332553173178e+02 1.788055146224819e+02 3.829208006453583e+02 7.955406370837679e+01 + 4 4.816257722022288e+02 1.951764152533999e+02 -9.415568848580531e+01 -4.301212128040067e+02 + ME 9.815491817147904e-03 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.868305165969143e+02 4.119610488151632e+00 5.515184990814984e+02 4.093244831537709e+02 + 3 3.260821955312832e+02 -1.956999890649130e+02 -2.483451099187457e+02 -7.972338993006395e+01 + 4 4.870872878718019e+02 1.915803785767614e+02 -3.031733891627526e+02 -3.296010932237068e+02 + ME 1.060686882434836e-03 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.159818802305118e+02 -2.018126805027918e+02 4.096951387107713e+01 -6.512536763314942e+01 + 3 6.870078865581223e+02 4.896730732821633e+02 -2.356527215298930e+02 -4.203188222421332e+02 + 4 5.970102332113653e+02 -2.878603927793715e+02 1.946832076588155e+02 4.854441898752826e+02 + ME 6.354991214413729e-05 + +Event 163 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.889699854403285e+02 -4.067839821807849e+01 -2.740835242435768e+02 4.028835269878221e+02 + 3 4.282392920294496e+02 4.007468150560175e+02 -8.832740907173850e+01 -1.224301852772270e+02 + 4 5.827907225302216e+02 -3.600684168379391e+02 3.624109333153152e+02 -2.804533417105952e+02 + ME 4.331344200689017e-04 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.224346677404146e+02 -1.282049393554145e+02 5.480608628970116e+02 -2.657399098565701e+02 + 3 7.444531740822748e+02 1.794330131141779e+02 -6.708967511266459e+02 2.681638893170602e+02 + 4 1.331121581773107e+02 -5.122807375876334e+01 1.228358882296343e+02 -2.423979460490185e+00 + ME 1.496786244595179e-04 + +Event 
165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.980339706506675e+02 -5.154669325341684e+01 -4.947847840614098e+02 4.896757907618869e+02 + 3 1.362964882116331e+02 4.252532371924361e+01 -5.641238783031591e+01 -1.165588780002596e+02 + 4 6.656695411377010e+02 9.021369534174053e+00 5.511971718917263e+02 -3.731169127616273e+02 + ME 1.407471676007468e-03 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.060640747281172e+02 -1.981167412190919e+02 -9.095380261170787e+01 -2.148310510107331e+02 + 3 5.580104478575087e+02 -3.585720992432472e+02 -1.558095186186281e+02 3.981521109704928e+02 + 4 6.359254774143742e+02 5.566888404623390e+02 2.467633212303362e+02 -1.833210599597597e+02 + ME 3.193485227416691e-04 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.833153623322893e+02 2.526850217013923e+02 8.687924899084067e+01 9.417998957332070e+01 + 3 6.595685044563417e+02 -8.780626893611857e+01 -2.875856231737450e+02 -5.870393347553995e+02 + 4 5.571161332113691e+02 -1.648787527652739e+02 2.007063741829043e+02 4.928593451820789e+02 + ME 7.239516534120865e-05 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.026267479353970e+02 -5.987968578530476e+02 5.775180228477304e+00 6.758674164241535e+01 + 3 4.991211680715714e+02 3.812575567959844e+02 3.220701575873952e+02 -5.952259631185695e+00 + 4 3.982520839930310e+02 2.175393010570631e+02 -3.278453378158729e+02 
-6.163448201122965e+01 + ME 9.456339971215072e-05 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.510662376679774e+02 -9.251111075413948e+01 -5.291920243323356e+02 -1.227660134875281e+02 + 3 5.034535790022879e+02 -2.816014265681678e+02 3.283802195198171e+02 2.575511098657944e+02 + 4 4.454801833297350e+02 3.741125373223072e+02 2.008118048125186e+02 -1.347850963782664e+02 + ME 1.525398117970454e-04 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.814808559369750e+02 3.658097943502283e+01 -1.412301634042881e+02 -2.407225480659936e+02 + 3 6.646522150540472e+02 2.753499086551697e+02 -1.631412967142655e+02 5.825203104495404e+02 + 4 5.538669290089781e+02 -3.119308880901925e+02 3.043714601185536e+02 -3.417977623835468e+02 + ME 8.171407096612135e-04 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.777965289077954e+02 -6.143496808852239e+01 -1.603735842336766e+00 1.668375809551635e+02 + 3 7.439290290569696e+02 2.163074211412066e+01 -1.907051550939618e+01 -7.433699124308462e+02 + 4 5.782744420352348e+02 3.980422597440168e+01 2.067425135173310e+01 5.765323314756826e+02 + ME 1.976071523686861e-03 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.369499454750680e+02 -1.250080331667567e+01 -3.518152151649629e+01 -1.317622025690455e+02 + 3 6.692885586315896e+02 -2.346283187163472e+02 -6.130705295376303e+02 1.305421486874673e+02 + 4 
6.937614958933425e+02 2.471291220330227e+02 6.482520510541266e+02 1.220053881578281e+00 + ME 5.307806726624245e-04 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.088772083623148e+02 4.973951266878910e+01 3.171232495758635e+01 -7.064185769505257e+02 + 3 5.785136264307897e+02 8.584813303397833e+01 5.766505028397116e+01 5.691949191590091e+02 + 4 2.126091652068945e+02 -1.355876457027673e+02 -8.937737524155736e+01 1.372236577915166e+02 + ME 1.935050972667781e-04 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.367208701713478e+02 -3.923163287174697e+01 4.325755195957346e+02 -4.543585887727656e+01 + 3 3.528978856725083e+02 9.622572295106897e+01 1.987077746703232e+02 -2.753048278549414e+02 + 4 7.103812441561447e+02 -5.699409007932230e+01 -6.312832942660565e+02 3.207406867322184e+02 + ME 9.278433788357707e-04 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.418562164876807e+02 1.962785648722137e+02 -6.110736372974048e+02 -6.567908015856713e+00 + 3 4.843421844702150e+02 -1.886631806266161e+02 3.569879071908528e+02 -2.674942804112338e+02 + 4 3.738015990421036e+02 -7.615384245597570e+00 2.540857301065516e+02 2.740621884270907e+02 + ME 2.565607541647813e-05 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.288652703123263e+02 4.005522031116294e+02 3.691482793515075e+02 3.142594606996526e+02 + 3 7.209127580467475e+02 
-4.124575135572966e+02 -5.165298058232565e+02 -2.877341896975221e+02 + 4 1.502219716409257e+02 1.190531044566672e+01 1.473815264717492e+02 -2.652527100213051e+01 + ME 1.821371572252850e-04 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.716578040000081e+02 -4.521622645932389e+02 -1.012739918234156e+01 1.338200520767546e+02 + 3 3.021382980750608e+02 -2.714821202364266e+02 6.773215888881046e+01 -1.140059832109246e+02 + 4 7.262038979249323e+02 7.236443848296656e+02 -5.760475970646935e+01 -1.981406886582875e+01 + ME 2.344625071758017e-03 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.350088877399502e+02 -3.684484945749095e+02 -2.561732769425163e+02 -5.821159885132296e+02 + 3 1.415495174310248e+02 7.181268644032879e+01 1.095010133995263e+02 5.374692563910759e+01 + 4 6.234415948290248e+02 2.966358081345808e+02 1.466722635429900e+02 5.283690628741219e+02 + ME 8.638828111793180e-05 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.426064621425415e+02 6.748632301344054e+01 7.201624948975953e+02 -1.681544967131679e+02 + 3 5.821031882499328e+02 8.394276920418474e-01 -5.588194474899292e+02 1.629854049874920e+02 + 4 1.752903496075257e+02 -6.832575070548242e+01 -1.613430474076661e+02 5.169091725675909e+00 + ME 8.369656241485756e-05 + +Event 180 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.099515195485482e+02 2.272495331206022e+02 
1.762692760011278e+02 -5.378918555193874e+02 + 3 5.718889655176698e+02 4.324570510796980e+01 -3.278409766521432e+02 4.665909256493895e+02 + 4 3.181595149337818e+02 -2.704952382285720e+02 1.515717006510154e+02 7.130092986999803e+01 + ME 5.933432850657378e-05 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.206370886915177e+02 -8.151225636567757e+01 1.767749325039422e+01 8.715827822142556e+01 + 3 6.451493408002738e+02 -6.748216257939075e+01 4.373428479320614e+02 4.694625256943417e+02 + 4 7.342135705082084e+02 1.489944189450684e+02 -4.550203411824557e+02 -5.566208039157672e+02 + ME 7.382610742608237e-02 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.626866082364763e+02 -3.084610429505738e+02 3.306629079434072e+02 9.794245113140902e+01 + 3 4.974966719253475e+02 3.582955998671218e+02 1.664640547097976e+02 -3.023523113558579e+02 + 4 5.398167198381767e+02 -4.983455691654799e+01 -4.971269626532049e+02 2.044098602244490e+02 + ME 5.807434687853696e-05 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.304723045950490e+02 3.244647182058515e+00 3.209425641774955e+02 7.872284845075714e+01 + 3 4.379804819457451e+02 2.312428523500661e+02 3.131807483468383e+02 2.006775141049615e+02 + 4 7.315472134592064e+02 -2.344874995321246e+02 -6.341233125243343e+02 -2.794003625557186e+02 + ME 4.982550210224409e-03 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 7.470051035005912e+02 -4.953964753944514e+02 -4.028924750569615e+02 3.876552725878487e+02 + 3 2.183325716323390e+02 1.119040172022778e+02 1.451703047217021e+02 -1.186262424448778e+02 + 4 5.346623248670699e+02 3.834924581921737e+02 2.577221703352594e+02 -2.690290301429710e+02 + ME 5.224245380647260e-04 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.448583927494090e+02 2.810173563272025e+02 -3.384637477435971e+02 6.610995769032235e+01 + 3 6.236443795626774e+02 -1.690803760724666e+02 5.125139620028375e+02 3.125277225134823e+02 + 4 4.314972276879136e+02 -1.119369802547359e+02 -1.740502142592404e+02 -3.786376802038046e+02 + ME 6.983427049018933e-03 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.802792190696962e+02 -1.681815241656754e+02 5.427923640013703e+02 3.739936368565512e+02 + 3 6.331554869749547e+02 3.172201723440435e+02 -4.588808692389625e+02 -2.994755095011972e+02 + 4 1.865652939553488e+02 -1.490386481783679e+02 -8.391149476240781e+01 -7.451812735535422e+01 + ME 3.265372218842585e-04 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.472897115267964e+02 -6.988402471604772e+02 -2.391684329048670e+02 1.134137672609268e+02 + 3 6.826908170748525e+02 6.328852277257668e+02 2.212839847556716e+02 -1.286718241709738e+02 + 4 7.001947139835137e+01 6.595501943471052e+01 1.788444814919546e+01 1.525805691004725e+01 + ME 1.270103171497999e-04 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.496068877140277e+02 -5.024316730938292e+02 -3.980061777252907e+02 -1.055585379310703e+02 + 3 4.885976180718370e+02 4.424928723138696e+02 1.459942636040002e+02 -1.470148473169288e+02 + 4 3.617954942141355e+02 5.993880077995961e+01 2.520119141212904e+02 2.525733852479991e+02 + ME 2.535349419322286e-05 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.082379946778651e+02 2.679237131173331e+02 -7.718184435750958e+01 2.981913934867988e+02 + 3 5.864211573889180e+02 -5.780822197382727e+02 -6.394893886953381e+01 7.497502433004088e+01 + 4 5.053408479332165e+02 3.101585066209396e+02 1.411307832270433e+02 -3.731664178168398e+02 + ME 1.973282874411826e-03 + +Event 190 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.472516823166367e+02 6.463779961822676e+02 -3.289365889632787e+01 6.945035458816682e+00 + 3 4.318767277050752e+02 -3.286790725415816e+02 -7.183748821760633e+00 -2.800642229191640e+02 + 4 4.208715899782886e+02 -3.176989236406860e+02 4.007740771808847e+01 2.731191874603473e+02 + ME 2.922185424766008e-05 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.757500036387050e+02 6.222744522021635e+02 -2.261571472854043e+02 1.351499844096745e+02 + 3 3.644673602666566e+02 -2.020102809038697e+02 1.114149692296406e+02 -2.821613151026251e+02 + 4 4.597826360946378e+02 -4.202641712982939e+02 1.147421780557638e+02 1.470113306929506e+02 + ME 4.775983703239685e-05 + +Event 192 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.394562478491533e+02 -7.307873850878615e+02 3.988568028534699e+01 1.056147375500684e+02 + 3 8.098058518630977e+01 5.419286926826392e+01 4.244928426361276e+00 -6.002473390399247e+01 + 4 6.795631669645364e+02 6.765945158195975e+02 -4.413060871170818e+01 -4.559000364607595e+01 + ME 3.945927565047896e-04 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.607395612273152e+02 -3.164229781907933e+02 -3.517992386171808e+02 -3.009030576558548e+02 + 3 3.741643617741926e+02 -2.156271676189966e+02 1.666697084176705e+02 2.563690747778811e+02 + 4 5.650960769984921e+02 5.320501458097899e+02 1.851295301995103e+02 4.453398287797368e+01 + ME 9.239067490162860e-05 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.729373416862012e+02 -2.155045544874616e+02 -1.679805246197324e+02 5.035846779262560e+02 + 3 2.831035485618876e+02 -2.543279085173982e+02 1.042261812492671e+02 -6.783684323208051e+01 + 4 6.439591097519117e+02 4.698324630048597e+02 6.375434337046515e+01 -4.357478346941755e+02 + ME 1.765290140756257e-03 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.572874060171201e+02 -5.433144409127298e+02 3.646295232533866e+01 1.185290019729285e+02 + 3 6.765845568040619e+02 5.574999049241243e+02 -1.212989803269169e+01 -3.831623469093195e+02 + 4 2.661280371788181e+02 -1.418546401139455e+01 -2.433305429264712e+01 2.646333449363910e+02 + ME 
3.575005996922769e-04 + +Event 196 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.405888343305829e+02 3.940239871950471e+02 -8.826690628749978e+01 -3.594305754554688e+02 + 3 6.983754392688073e+02 -3.888370902622853e+02 -5.513072771506091e+01 5.774898910559966e+02 + 4 2.610357264006097e+02 -5.186896932761887e+00 1.433976340025607e+02 -2.180593156005277e+02 + ME 5.545922876141835e-03 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.783346334111662e+02 2.282410890438732e+02 -1.474467226896363e+02 6.029624695020832e+01 + 3 6.434654504578667e+02 1.172104173128903e+01 6.205939438823053e+02 1.696277097949658e+02 + 4 5.781999161309676e+02 -2.399621307751624e+02 -4.731472211926696e+02 -2.299239567451741e+02 + ME 3.465199721360001e-04 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.349536439683952e+02 1.774777254208014e+02 -9.709992209949115e+01 3.850427697141143e+02 + 3 4.134500153047131e+02 7.095914770071856e+01 -4.041194890923879e+02 -5.092301099466206e+01 + 4 6.515963407268930e+02 -2.484368731215194e+02 5.012194111918788e+02 -3.341197587194524e+02 + ME 7.876321607691774e-04 + +Event 199 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.682109290882580e+02 2.136897997740939e+02 -5.035763266519416e+02 3.837361052354048e+02 + 3 1.424120473397155e+02 8.952788458880865e+01 -4.686863299276860e+01 -1.003458038481504e+02 + 4 6.893770235720265e+02 
-3.032176843629025e+02 5.504449596447103e+02 -2.833903013872543e+02 + ME 1.130528312945204e-03 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.959952693237886e+02 -4.878566955018547e+02 -2.510837703973927e+01 -3.414319479966339e+02 + 3 4.479637599869171e+02 4.499951041477977e+01 7.146287716862109e+01 4.399313940955211e+02 + 4 4.560409706892943e+02 4.428571850870749e+02 -4.635450012888172e+01 -9.849944609888662e+01 + ME 5.328590142694770e-04 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.203096708642927e+02 -1.112696379946441e+02 1.367824427202020e+02 4.895219960522141e+02 + 3 2.871951825199399e+02 -2.582762312778227e+02 1.200876310962787e+02 3.678888524092984e+01 + 4 6.924951466157675e+02 3.695458692724667e+02 -2.568700738164807e+02 -5.263108812931440e+02 + ME 6.616845404617044e-03 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.158792376054218e+02 2.112389782008979e+01 -7.195062193526134e+01 -2.024369881546198e+02 + 3 5.463652944256572e+02 2.787950008966255e+02 -3.108926376755555e+02 -3.523267663221479e+02 + 4 7.377554679689214e+02 -2.999188987167153e+02 3.828432596108168e+02 5.547637544767679e+02 + ME 8.746084338295733e-03 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.124273471334272e+02 4.879265047129838e+02 -1.059167473143779e+02 -5.081949365946949e+02 + 3 6.746108110440505e+02 -5.248642991835990e+02 
4.352799102536775e+01 4.215714978711399e+02 + 4 1.129618418225217e+02 3.693779447061508e+01 6.238875628901039e+01 8.662343872355493e+01 + ME 6.471879264449323e-05 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.084787759842806e+02 4.992472551829617e+02 -4.528122431715626e+02 -2.183012291454193e+02 + 3 1.034373169902747e+02 -8.959882065299325e+01 -3.938861547415053e+01 -3.346441176487074e+01 + 4 6.880839070254442e+02 -4.096484345299685e+02 4.922008586457130e+02 2.517656409102900e+02 + ME 2.801205615782351e-04 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.496569846879349e+02 -5.869603795046560e+02 -2.345911576090251e+02 1.499956646614410e+02 + 3 2.543878192344406e+02 -1.851019090219872e+00 2.474675926596849e+02 -5.890268997594536e+01 + 4 5.959551960776247e+02 5.888113985948759e+02 -1.287643505065981e+01 -9.109297468549572e+01 + ME 1.780789057075663e-04 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.172060642836409e+02 2.978040691523503e+02 4.166709400833432e+02 3.444435946201742e+02 + 3 7.205754982426179e+02 -2.468045809177360e+02 -5.690387091428451e+02 -3.667580878490107e+02 + 4 1.622184374737408e+02 -5.099948823461420e+01 1.523677690595017e+02 2.231449322883639e+01 + ME 8.133674600963182e-05 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.250113096394139e+02 -1.091977068802181e+02 -4.322753509449321e+02 
2.772196909074646e+02 + 3 5.240251005653129e+02 3.541948269240045e+02 3.738549241960731e+02 9.685466564450641e+01 + 4 4.509635897952731e+02 -2.449971200437864e+02 5.842042674885890e+01 -3.740743565519710e+02 + ME 3.395316586355856e-03 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.449444343820048e+02 1.928662436733418e+02 -3.595193210859464e+02 1.775500478872298e+02 + 3 4.894053462810563e+02 -2.195789585225566e+02 2.295326432211599e+02 3.723136307450180e+02 + 4 5.656502193369389e+02 2.671271484921491e+01 1.299866778647864e+02 -5.498636786322478e+02 + ME 2.076512582794265e-01 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.949423498078045e+02 -2.830370809537592e+02 -1.684680620467475e+02 -3.694271951395290e+02 + 3 6.326444171345162e+02 3.898538983719824e+02 -1.748162179498051e+02 4.665749526039371e+02 + 4 3.724132330576787e+02 -1.068168174182232e+02 3.432842799965526e+02 -9.714775746440787e+01 + ME 1.539925201273449e-04 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.469464199121013e+02 -4.947084169679944e+02 2.319240083666634e+02 -2.500445517953787e+01 + 3 2.929141603572805e+02 -5.602902696925144e+01 2.099470855189297e+01 2.867379913571110e+02 + 4 6.601394197306176e+02 5.507374439372460e+02 -2.529187169185561e+02 -2.617335361775728e+02 + ME 1.527282387432709e-03 + +Event 211 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
5.484404249965426e+02 1.659778109685240e+01 3.514591842057613e+02 -4.206992456262191e+02 + 3 4.635537606517393e+02 -3.607884938122542e+02 -3.140996451540818e+01 2.893564685231623e+02 + 4 4.880058143517180e+02 3.441907127154017e+02 -3.200492196903531e+02 1.313427771030569e+02 + ME 5.301242403965074e-05 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.930853388432640e+02 -3.424793196872474e+02 -8.152110066892752e+01 5.970171795281683e+02 + 3 9.131624224772825e+01 6.738328155058524e+01 1.365968298972706e+01 6.009627714210347e+01 + 4 7.155984189090078e+02 2.750960381366621e+02 6.786141767920040e+01 -6.571134566702718e+02 + ME 3.228977178268723e-01 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.316448870278512e+02 4.203233031264803e+02 4.913598772661251e+02 -3.423419819067778e+02 + 3 4.750162603483208e+02 -1.726357548525294e+02 -3.708603862154638e+02 2.414537588813190e+02 + 4 2.933388526238279e+02 -2.476875482739507e+02 -1.204994910506614e+02 1.008882230254589e+02 + ME 3.396976850624664e-05 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.805779599533694e+02 3.904513572450257e+02 -1.742898429406511e+02 2.193763065287195e+02 + 3 6.164938851206517e+02 -5.563771061772993e+02 2.227142270499353e+02 1.445946028815716e+02 + 4 4.029281549259790e+02 1.659257489322735e+02 -4.842438410928419e+01 -3.639709094102910e+02 + ME 1.136455087245070e-02 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.610896439725646e+02 -3.106576460930040e+02 -3.050258363865883e+02 -1.518378274323046e+02 + 3 7.153470686812822e+02 2.726436938726978e+02 6.046054769368645e+02 2.680280994976065e+02 + 4 3.235632873461536e+02 3.801395222030654e+01 -2.995796405502761e+02 -1.161902720653026e+02 + ME 2.210597722197607e-04 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.309452696424390e+02 -4.912950836090374e+02 -3.608909251460835e+01 -1.980646298023531e+02 + 3 6.627369363365401e+02 4.479096066616001e+02 2.308759280187053e+02 4.304573578259470e+02 + 4 3.063177940210213e+02 4.338547694743725e+01 -1.947868355040969e+02 -2.323927280235939e+02 + ME 1.887136196869648e-03 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.608032244164872e+02 2.215832851737383e+02 3.318832460795877e+02 -2.304212888079595e+02 + 3 3.107022283044696e+02 -4.724697178681159e+01 2.830528592337837e+02 -1.190994425256425e+02 + 4 7.284945472790436e+02 -1.743363133869267e+02 -6.149361053133714e+02 3.495207313336019e+02 + ME 2.899332746907801e-03 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.336891602166278e+02 5.249943224110906e+02 1.648031440577740e+02 -3.142973702098818e+02 + 3 5.195346944320728e+02 -3.655895580768900e+02 -3.610279413409488e+02 7.693763263116620e+01 + 4 3.467761453512955e+02 -1.594047643342020e+02 1.962247972831738e+02 2.373597375787181e+02 + ME 2.665422453699515e-05 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.579228498517418e+02 -4.166553381892275e+01 1.191899344508914e+02 2.249042891828000e+02 + 3 7.453266221408655e+02 -3.354388163550536e+01 -3.947818065141065e+02 -6.312954196904916e+02 + 4 4.967505280073932e+02 7.520941545442811e+01 2.755918720632151e+02 4.063911305076915e+02 + ME 7.854533112887660e-05 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.940336288355579e+02 -2.383755021420816e+02 -2.918661661143954e+02 3.194690712363630e+02 + 3 7.129224521449783e+02 2.727447507998267e+02 2.535039959962390e+02 -6.079510240944472e+02 + 4 2.930439190194636e+02 -3.436924865774515e+01 3.836217011815622e+01 2.884819528580837e+02 + ME 1.717959371059500e-04 + +Event 221 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.305414381337778e+02 -2.712796684963201e+02 -1.199910663213095e+02 -1.458325333632649e+02 + 3 7.388441803280767e+02 5.510455284380057e+02 4.375213740715826e+02 2.254209298704556e+02 + 4 4.306143815381457e+02 -2.797658599416857e+02 -3.175303077502730e+02 -7.958839650719051e+01 + ME 1.436997677454732e-04 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.657562074797755e+02 2.823280548971349e+02 2.956503281023744e+02 2.231828795335844e+02 + 3 4.791948192186352e+02 -3.228825926298714e+02 2.575611801233854e+02 -2.429747818931872e+02 + 4 5.550489733015892e+02 4.055453773273639e+01 -5.532115082257600e+02 1.979190235960288e+01 + ME 8.848913744956517e-05 + +Event 223 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.612164685986320e+02 -4.527922182271190e+01 -1.095260585492910e+01 1.543391792239739e+02 + 3 6.984218503485874e+02 -4.629950983513679e+02 2.605715575888555e+02 -4.533553609726804e+02 + 4 6.403616810527803e+02 5.082743201740798e+02 -2.496189517339264e+02 2.990161817487065e+02 + ME 4.363501574346948e-04 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.663853414671973e+02 -1.350882138037310e+02 9.706071747767020e+01 3.804401292344737e+00 + 3 6.436745581417565e+02 -4.469273298203082e+02 -4.412749113764767e+02 -1.408877256838113e+02 + 4 6.899401003910461e+02 5.820155436240392e+02 3.442141938988061e+02 1.370833243914661e+02 + ME 3.615602246092481e-04 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.702356777533546e+02 6.117158080352369e+02 -2.649249521350114e+02 -6.952987609335720e+01 + 3 6.901224376513153e+02 -6.564819557015361e+02 1.560869289536551e+02 1.446972404640001e+02 + 4 1.396418845953297e+02 4.476614766629927e+01 1.088380231813564e+02 -7.516736437064299e+01 + ME 6.427659719915406e-04 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.307777643673113e+02 -4.569648094661606e+02 4.416236342013199e+02 -3.608155616351098e+02 + 3 1.446420186345138e+02 4.133161435221924e+01 -3.411742569426914e+01 1.343466131828505e+02 + 4 6.245802169981753e+02 4.156331951139413e+02 -4.075062085070508e+02 
2.264689484522593e+02 + ME 4.396758174746568e-04 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.408615397889291e+02 -4.398089081634772e+02 -5.325812259979131e+02 2.679574278743412e+02 + 3 4.035753807128125e+02 3.000971513323747e+02 2.468113220276344e+02 -1.090823496201683e+02 + 4 3.555630794982586e+02 1.397117568311025e+02 2.857699039702786e+02 -1.588750782541728e+02 + ME 3.108259965567685e-04 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.775455372723294e+02 -3.656199842755111e+02 -6.289501053880601e+01 4.426342647953073e+02 + 3 3.247306314578497e+02 8.776645762339837e+01 3.116872137482897e+02 2.445634292125525e+01 + 4 5.977238312698206e+02 2.778535266521127e+02 -2.487922032094836e+02 -4.670906077165625e+02 + ME 3.155752151261513e-03 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.665477125629452e+02 -2.081014917770363e+02 2.317985113364040e+02 -1.931850016112187e+02 + 3 6.187040836990478e+02 -2.134593092471877e+02 -3.484367286517815e+02 4.645661552545953e+02 + 4 5.147482037380066e+02 4.215608010242241e+02 1.166382173153775e+02 -2.713811536433765e+02 + ME 4.511168094162984e-04 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.913978529013565e+02 -4.986092821675884e+02 -3.028328044703766e+02 9.712104143419771e+01 + 3 3.439186614041001e+02 -6.573524045766425e+01 3.216488491089061e+02 -1.024741025375549e+02 + 4 
5.646834856945436e+02 5.643445226252528e+02 -1.881604463852933e+01 5.353061103357446e+00 + ME 1.026777973262958e-04 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.760768557894827e+02 -7.075794524290799e+01 5.609870884449791e+02 1.102331327656218e+02 + 3 6.038619762337338e+02 -2.467027894308989e+02 -5.464177649873398e+02 -7.221250677108812e+01 + 4 3.200611679767834e+02 3.174607346738069e+02 -1.456932345763944e+01 -3.802062599453370e+01 + ME 8.813656492911543e-05 + +Event 232 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.230187249684843e+02 -2.426041066061352e+02 1.884455685697195e+02 -6.545132479937492e+02 + 3 4.821326920133731e+02 2.438648429837413e+02 -1.563760752388982e+01 4.156168142598493e+02 + 4 2.948485830181424e+02 -1.260736377606032e+00 -1.728079610458298e+02 2.388964337338999e+02 + ME 4.301353051471335e-05 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.540260977608100e+02 -1.904526694678991e+02 -1.042089619355360e+02 -2.796475475319170e+02 + 3 4.925592302096041e+02 1.195034224421750e+02 3.554637678715695e+02 -3.193415679485398e+02 + 4 6.534146720295859e+02 7.094924702572415e+01 -2.512548059360335e+02 5.989891154804570e+02 + ME 2.564821772678246e-04 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.866526101194274e+02 7.776953530733711e+01 -1.047503781897389e+01 1.693557493124072e+02 + 3 6.012752698516813e+02 
5.974840035795013e+02 -4.570329760029637e+01 4.955829083294179e+01 + 4 7.120721200288896e+02 -6.752535388868375e+02 5.617833541927042e+01 -2.189140401453492e+02 + ME 2.281324305129538e-03 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.032945404607943e+02 1.612889276925247e+02 2.561838854094329e+02 -4.020710050699557e+02 + 3 7.153634726767364e+02 -3.739069589148945e+02 -1.979140468542056e+02 5.768609140624164e+02 + 4 2.813419868624689e+02 2.126180312223699e+02 -5.826983855522716e+01 -1.747899089924608e+02 + ME 8.352793145677155e-04 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.980797829886611e+02 -9.803971882836205e+00 4.740144261428888e+02 5.123764137440797e+02 + 3 5.519387921056283e+02 -1.638876688381594e+02 -3.209728652821290e+02 -4.180355032606608e+02 + 4 2.499814249057108e+02 1.736916407209956e+02 -1.530415608607599e+02 -9.434091048341890e+01 + ME 2.721412302642413e-04 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.604490925133744e+02 6.212857081252701e+01 9.075394990141039e+01 1.168232534834160e+02 + 3 6.578242662283154e+02 5.348507070161563e+02 -3.810396531957999e+02 3.842224792439631e+01 + 4 6.817266412583111e+02 -5.969792778286831e+02 2.902857032943891e+02 -1.552455014078122e+02 + ME 8.848909205484966e-04 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.789018340499541e+02 1.069933592962544e+02 
-2.572713415352737e+02 1.225197647611559e+01 + 3 4.761759619803054e+02 7.755191627191857e+01 -4.591043622469822e+02 -9.976187456245110e+01 + 4 7.449222039697414e+02 -1.845452755681728e+02 7.163757037822558e+02 8.750989808633528e+01 + ME 4.151281620443684e-02 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.581461811054764e+02 -3.899520773556200e+02 2.006122777919944e+02 1.326273524830990e+02 + 3 3.013476461129690e+02 -2.996604136348060e+02 3.145663680794621e+01 4.951799549362093e+00 + 4 7.405061727815548e+02 6.896124909904260e+02 -2.320689145999406e+02 -1.375791520324611e+02 + ME 1.358758913363619e-02 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.932490652975306e+02 -4.094504138983959e+01 -3.300190662632462e+02 4.912793227530680e+02 + 3 3.147487537014151e+02 3.081803657249564e+02 4.097350029662017e+01 -4.912038692507524e+01 + 4 5.920021810010545e+02 -2.672353243351168e+02 2.890455659666261e+02 -4.421589358279928e+02 + ME 2.294540625556830e-03 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.438703186026562e+01 1.425431959717181e+01 -4.430288595443105e+00 -4.180186016371769e+01 + 3 7.139617398095608e+02 -8.415544716076501e+01 -5.657765076565166e+02 -4.272659242311072e+02 + 4 7.416512283301738e+02 6.990112756359289e+01 5.702067962519594e+02 4.690677843948251e+02 + ME 9.897863289217103e-03 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 3.798759956195423e+02 -1.259218082844715e+02 -3.429343473884153e+02 1.041417477651927e+02 + 3 6.208895880511434e+02 5.354328139337264e+02 1.248673426784089e+02 -2.884852319370315e+02 + 4 4.992344163293142e+02 -4.095110056492549e+02 2.180670047100064e+02 1.843434841718388e+02 + ME 4.681565217176860e-05 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.320641800899439e+02 1.658639294991472e+02 7.783463994856530e+01 1.424243988788333e+02 + 3 6.251485586341130e+02 -2.328139095298015e+02 -4.262931976140131e+02 3.935511574875349e+02 + 4 6.427872612759425e+02 6.694998003065480e+01 3.484585576654475e+02 -5.359755563663683e+02 + ME 1.088174374801640e-02 + +Event 244 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.609991843787813e+02 -2.293678857540617e+02 -4.971623496474938e+02 -3.703240376037023e+02 + 3 1.091403980947071e+02 1.154537470975927e+01 -9.115666825632124e+00 -1.081445118228680e+02 + 4 7.298604175265122e+02 2.178225110443025e+02 5.062780164731259e+02 4.784685494265703e+02 + ME 2.108318833367259e-03 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.893629130846660e+02 -3.546974954177181e+02 3.112856868655738e+02 -1.294873298810981e+02 + 3 7.129026631852472e+02 5.703735458058532e+02 -4.257115617679145e+02 -4.091322034012453e+01 + 4 2.977344237300872e+02 -2.156760503881351e+02 1.144258749023407e+02 1.704005502212232e+02 + ME 3.089966081110584e-05 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.999457395350199e+02 9.605025124341066e+01 9.072234098128426e+01 3.774922524438974e+02 + 3 3.675469088581874e+02 -1.615841482674672e+01 2.570183669846762e+02 2.622426259669195e+02 + 4 7.325073516067926e+02 -7.989183641666395e+01 -3.477407079659604e+02 -6.397348784108170e+02 + ME 1.318572178119925e-01 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.711864521923226e+02 3.763073240556692e+02 5.338170415278108e+02 1.546719678644905e+02 + 3 5.231557804938882e+02 -1.057595517177888e+02 -5.121603131388773e+02 -1.409615302513522e+01 + 4 3.056577673137891e+02 -2.705477723378804e+02 -2.165672838893370e+01 -1.405758148393554e+02 + ME 2.832635912870236e-04 + +Event 248 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.307803946875937e+02 -6.240065811552295e+01 -3.654556314590156e+02 5.103256270499047e+02 + 3 3.935347424219227e+02 -2.188782290807617e+02 2.916853933646317e+01 -3.257470040392325e+02 + 4 4.756848628904837e+02 2.812788871962846e+02 3.362870921225527e+02 -1.845786230106721e+02 + ME 2.300626369478844e-04 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.326970760901860e+02 -4.070406664121579e+02 -1.467447404863359e+02 3.261392852829556e+00 + 3 4.839435229991530e+02 2.335311811831336e+01 2.018595963184923e+02 -4.392136936630268e+02 + 4 5.833594009106612e+02 3.836875482938445e+02 -5.511485583215643e+01 4.359523008101971e+02 + ME 7.949848217937372e-05 + +Event 250 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.010671671345862e+02 -6.122994886156979e+02 -2.473946684860859e+02 2.353303785738851e+02 + 3 5.574643785654461e+02 3.902114201641946e+02 2.260985614407800e+02 -3.276904354069721e+02 + 4 2.414684542999682e+02 2.220880684515034e+02 2.129610704530567e+01 9.236005683308701e+01 + ME 4.001179096186077e-05 + +Event 251 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.364006127103795e+02 5.379960890463808e+02 4.302640987755425e+02 2.602285070392759e+02 + 3 3.051282143252570e+01 -2.901685968644104e+00 1.337962970917706e+01 -2.726899336532026e+01 + 4 7.330865658570955e+02 -5.350944030777370e+02 -4.436437284847198e+02 -2.329595136739561e+02 + ME 8.214352127951258e-03 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.965625584838614e+02 -7.369842915522102e+01 -5.671364104158781e+02 -1.697401534860145e+02 + 3 6.549338760881152e+02 -1.514014639568436e+02 6.313240788068731e+02 8.628954906696531e+01 + 4 2.485035654280237e+02 2.250998931120648e+02 -6.418766839099476e+01 8.345060441904933e+01 + ME 4.193497354253246e-05 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.728678540484714e+02 3.212236187283236e+01 -4.622666283104808e+02 -3.368312580807653e+02 + 3 7.160302400837320e+02 1.132435775281999e+02 5.206369974620781e+02 4.783433011307397e+02 + 4 2.111019058677967e+02 -1.453659394010323e+02 -5.837036915159722e+01 -1.415120430499744e+02 + ME 
1.260323318561321e-03 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.579357369440609e+02 1.333150067790222e+02 -6.785864805882140e+01 5.375077668373273e+02 + 3 6.202682598689536e+02 -4.039338689731095e+02 2.012068793592835e+02 -4.255419314189537e+02 + 4 3.217960031869853e+02 2.706188621940872e+02 -1.333482313004621e+02 -1.119658354183736e+02 + ME 6.303494480277278e-04 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.263612771087841e+02 3.396063850675521e+02 -6.401091575508393e+02 5.028393902637346e+01 + 3 1.540578578981474e+02 -3.080387127739227e+01 1.060177193258910e+02 -1.074485378375538e+02 + 4 6.195808649930683e+02 -3.088025137901597e+02 5.340914382249483e+02 5.716459881118024e+01 + ME 1.409623960863276e-04 + diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gux_ttxux.txt b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gux_ttxux.txt new file mode 100644 index 0000000000..bd6fa59493 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/test/ref/dump_CPUTest.Sigma_MSSM_SLHA2_gux_ttxux.txt @@ -0,0 +1,4096 @@ +Event 0 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.647483690509011e+02 7.527657265342381e+01 -2.528976247704283e+02 -2.163164141117315e+01 + 3 6.252973211776936e+02 -5.721080498766041e+02 -1.578766990348905e+01 2.518727230515587e+02 + 4 6.099543097714056e+02 4.968314772231802e+02 2.686852946739174e+02 -2.302410816403857e+02 + ME 3.515102921254821e-04 + +Event 1 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.542827954151951e+02 1.482213322085297e+02 -1.988618298139057e+02 -5.607271498295619e+01 + 3 6.883656117507994e+02 1.265478873489434e+02 5.602777828023584e+02 3.793700749224231e+02 + 4 5.573515928340057e+02 -2.747692195574731e+02 -3.614159529884527e+02 -3.232973599394666e+02 + ME 7.281038654007379e-04 + +Event 2 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.301460683791101e+02 -3.656995432079240e+02 -2.257802895903974e+02 -1.768459985405174e+01 + 3 5.058528987551352e+02 2.755467101243707e+02 -2.034821274188550e+02 3.722313656043858e+02 + 4 5.640010328657552e+02 9.015283308355326e+01 4.292624170092524e+02 -3.545467657503341e+02 + ME 8.168247654499103e-04 + +Event 3 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.758793342627306e+02 1.455349847705337e+02 4.360940220328824e+02 -4.954335945799966e+02 + 3 3.008019460079605e+02 -1.607139834787174e+02 2.732727402256846e+01 2.527964523704278e+02 + 4 5.233187197293092e+02 1.517899870818368e+01 -4.634212960554508e+02 2.426371422095687e+02 + ME 7.786951583906058e-05 + +Event 4 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.540811678028369e+02 5.414642718170588e+01 -3.497885023717100e+02 -9.467915537920083e+00 + 3 7.415000547748699e+02 1.453779348794835e+00 7.277337852109663e+02 1.422102514562808e+02 + 4 4.044187774222939e+02 -5.560020653050046e+01 -3.779452828392566e+02 -1.327423359183604e+02 + 
ME 2.019556978652450e-04 + +Event 5 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.747467875786875e+02 2.462969907607520e+02 3.713870243947702e+02 1.636886763636383e+02 + 3 3.438196236093863e+02 -2.056491112573935e+02 2.636029701703988e+02 8.021128807897369e+01 + 4 6.814335888119256e+02 -4.064787950335842e+01 -6.349899945651691e+02 -2.438999644426124e+02 + ME 6.190232325432416e-04 + +Event 6 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.623951200922343e+02 4.644673798421034e+02 3.089047820108763e+02 -7.166700647426805e+01 + 3 2.268243199894468e+02 1.761899852590787e+02 -7.114332369064562e+01 -1.238748914321566e+02 + 4 7.107805599183189e+02 -6.406573651011822e+02 -2.377614583202307e+02 1.955418979064247e+02 + ME 8.401930442090062e-04 + +Event 7 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.922243378496302e+02 2.878585072835455e+02 -1.441537488072182e+02 -3.723465794939189e+02 + 3 2.873990637609374e+02 -5.400981623596619e+01 -8.913204919452848e+01 -2.678369642286231e+02 + 4 7.203765983894325e+02 -2.338486910475794e+02 2.332857980017467e+02 6.401835437225419e+02 + ME 2.053506126349072e-03 + +Event 8 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.353309706037128e+02 -7.529439061162444e+01 -4.917829145606098e+01 -3.230466069128648e+02 + 3 7.169322705461503e+02 -1.597426278178965e+02 -1.460012137440142e+01 6.987567601563110e+02 + 4 4.477367588501367e+02 
2.350370184295209e+02 6.377841283046253e+01 -3.757101532434461e+02 + ME 5.201286437913765e-03 + +Event 9 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.557626120875720e+02 2.000882245504951e+02 -5.276260741790070e+01 -1.503174088272976e+02 + 3 7.044202058180884e+02 -6.969679478438196e+02 -1.019614549623776e+02 6.882422911146141e+00 + 4 5.398171820943396e+02 4.968797232933244e+02 1.547240623802783e+02 1.434349859161516e+02 + ME 6.512222805562601e-05 + +Event 10 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.466796552973451e+02 1.172124288883391e+02 -1.804077050554744e+02 2.718475489457261e+02 + 3 5.174471655316497e+02 -1.610456139025785e+02 -4.497410659869823e+02 -1.988689340353917e+02 + 4 6.358731791710056e+02 4.383318501423927e+01 6.301487710424566e+02 -7.297861491033446e+01 + ME 2.125918815883772e-04 + +Event 11 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.730783827248505e+02 -3.059484875398851e+01 3.466457017175527e+02 -4.553235612803232e+02 + 3 4.410994673708889e+02 -3.026218886155177e+02 -1.990641070399050e+01 3.203005892260323e+02 + 4 4.858221499042605e+02 3.332167373695060e+02 -3.267392910135625e+02 1.350229720542916e+02 + ME 5.149703709932931e-05 + +Event 12 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.275003875859171e+02 -1.247450244086003e+02 1.654605359856639e+02 9.390376067217456e+01 + 3 6.138170466352969e+02 3.363961838598331e+02 -2.139358085817026e+01 
5.129827374509639e+02 + 4 6.586825657787861e+02 -2.116511594512328e+02 -1.440669551274935e+02 -6.068864981231385e+02 + ME 5.286122778520148e-02 + +Event 13 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.867684047377951e+02 7.055192702127013e+01 -2.028354730671930e+02 1.900429278217245e+02 + 3 6.990707050557397e+02 -5.605742285334718e+02 2.413419117565431e+02 -3.408965629057133e+02 + 4 5.141608902064656e+02 4.900223015122018e+02 -3.850643868935018e+01 1.508536350839886e+02 + ME 6.455516493173928e-05 + +Event 14 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.551549262960331e+02 1.090410064132905e+02 3.205839746298527e+02 1.071027348074892e+02 + 3 5.276349775014137e+02 3.895763694332612e+02 -2.529209653865598e+02 2.503196099590424e+02 + 4 6.172100962025532e+02 -4.986173758465519e+02 -6.766300924329286e+01 -3.574223447665316e+02 + ME 7.466790602956014e-04 + +Event 15 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.846731991828424e+02 7.106081559720656e+01 3.900476102503053e+02 4.297161529048977e+02 + 3 2.829885923647301e+02 -2.767806781033228e+02 5.223342094943638e+01 -2.732525156618248e+01 + 4 6.323382084524277e+02 2.057198625061163e+02 -4.422810311997417e+02 -4.023909013387151e+02 + ME 1.263735162017419e-03 + +Event 16 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.471577506095513e+02 1.666056475215675e+02 -5.784682380714991e+02 -4.425627187781377e+02 + 3 
6.589296733908155e+02 -1.235441202519037e+02 5.251239647671504e+02 3.783780998595694e+02 + 4 9.391257599963079e+01 -4.306152726966399e+01 5.334427330434853e+01 6.418461891856477e+01 + ME 5.536805121346023e-05 + +Event 17 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.567490993131759e+02 3.856364495163705e+01 -1.708845728849434e+02 -3.107752047682323e+02 + 3 6.453207560475681e+02 4.468356462873770e+02 2.282834847349607e+02 4.057874246326636e+02 + 4 4.979301446392561e+02 -4.853992912390144e+02 -5.739891185001712e+01 -9.501221986443127e+01 + ME 1.330824442141138e-04 + +Event 18 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.856701782481425e+02 2.509110753153842e+02 -3.498523763974107e+02 -2.247720379690151e+02 + 3 3.014847498930009e+02 -1.059425909901355e+02 -2.435847754696140e+02 -1.426032222348426e+02 + 4 7.128450718588564e+02 -1.449684843252488e+02 5.934371518670247e+02 3.673752602038576e+02 + ME 1.020387128757764e-03 + +Event 19 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.848213503304407e+02 -3.141116763848333e+02 -1.950442390378233e+02 4.531088295091878e+02 + 3 5.769300027107225e+02 5.020221748138873e+02 2.252239828724832e+02 -1.734823378963535e+02 + 4 3.382486469588368e+02 -1.879104984290540e+02 -3.017974383465995e+01 -2.796264916128346e+02 + ME 4.274816590330906e-03 + +Event 20 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.550938429889906e+02 
-4.478597170519694e+02 -1.958065402362923e+02 -2.630791652090858e+02 + 3 5.585686897587656e+02 3.351111310173187e+02 -1.360174455686904e+02 4.256744830831254e+02 + 4 3.863374672522434e+02 1.127485860346507e+02 3.318239858049827e+02 -1.625953178740396e+02 + ME 2.775098160851745e-04 + +Event 21 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.296556563991995e+02 -3.477135312394777e+02 -1.376147989324513e+02 -5.065804111325868e+02 + 3 3.137568007204202e+02 1.080474571851863e+02 -2.382188236683312e+02 1.732653140250679e+02 + 4 5.565875428803803e+02 2.396660740542913e+02 3.758336226007825e+02 3.333150971075188e+02 + ME 5.538930311163402e-05 + +Event 22 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.583338925767157e+02 2.471586228668331e+02 -1.597599499756147e+02 -4.744745610949309e+02 + 3 5.378723432497914e+02 9.149532098241647e+00 4.314513680009924e+02 3.210493120152683e+02 + 4 4.037937641734918e+02 -2.563081549650743e+02 -2.716914180253777e+02 1.534252490796626e+02 + ME 3.713249723823132e-05 + +Event 23 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.057340011976823e+02 6.848115528115159e+01 -5.207204912425279e+02 -3.017849923015606e+02 + 3 6.884459352783618e+02 -2.949639632364768e+01 6.680977958792450e+02 1.635026102131438e+02 + 4 2.058200635239559e+02 -3.898475895750392e+01 -1.473773046367171e+02 1.382823820884168e+02 + ME 2.948680174230038e-05 + +Event 24 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 4.702316790647312e+02 -1.210575128627593e+02 4.313728504035304e+02 -1.427598490831809e+02 + 3 7.180482366151730e+02 1.040047389253586e+02 -7.104588047260975e+02 4.956931953573400e+00 + 4 3.117200843200958e+02 1.705277393740067e+01 2.790859543225672e+02 1.378029171296075e+02 + ME 3.150283868438323e-05 + +Event 25 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.261365010744015e+02 -5.354018140499276e+02 -2.095559720530078e+02 2.479477970595020e+02 + 3 5.483958991041942e+02 5.199465180092641e+02 -9.843995208133502e+01 -1.438862620216537e+02 + 4 3.254675998214044e+02 1.545529604066344e+01 3.079959241343431e+02 -1.040615350378483e+02 + ME 1.660640019878967e-04 + +Event 26 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.635816356180675e+02 1.904702824079147e+02 -2.351549941335565e+02 -3.511853259118595e+02 + 3 3.686385821486526e+02 -2.712527815845713e+02 -6.015354190959190e+01 -2.422764621809818e+02 + 4 6.677797822332795e+02 8.078249917665664e+01 2.953085360431485e+02 5.934617880928414e+02 + ME 3.259676238066714e-04 + +Event 27 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.851713673150520e+02 1.387976072955998e+02 1.520424011317634e+02 -1.973348453858079e+02 + 3 6.747356481771329e+02 2.426633222154767e+02 -4.300238522839811e+02 4.598501858640580e+02 + 4 5.400929845078149e+02 -3.814609295110765e+02 2.779814511522176e+02 -2.625153404782502e+02 + ME 4.168139084318387e-04 + +Event 28 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.977804200471009e+02 -1.803202618401224e+02 -8.082809162516924e+01 -8.277519444290682e+00 + 3 7.197523834069630e+02 3.152541965091956e+02 6.467033971658864e+02 -2.080867841663850e+01 + 4 5.824671965459365e+02 -1.349339346690732e+02 -5.658753055407170e+02 2.908619786092892e+01 + ME 1.177873916973783e-04 + +Event 29 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.123364628491765e+02 -3.746492624245139e+02 3.785128791537566e+02 -3.021950929683376e+02 + 3 4.056577755659300e+02 1.796205570313495e+00 -8.781658530568644e+01 3.960344074293251e+02 + 4 4.820057615848937e+02 3.728530568542006e+02 -2.906962938480702e+02 -9.383931446098750e+01 + ME 5.458827395615220e-04 + +Event 30 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.349194950356051e+02 7.241679607953655e+02 1.425637322816714e+01 1.244354634469207e+02 + 3 7.321421454671269e+02 -7.253765693071589e+02 -2.895970851972086e+01 -9.498573130653320e+01 + 4 3.293835949726733e+01 1.208608511793151e+00 1.470333529155410e+01 -2.944973214038765e+01 + ME 5.154261794538042e-02 + +Event 31 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.718338270585457e+02 -1.344914872264095e+02 -1.021614404532311e+02 3.165350011824393e+01 + 3 6.313115253715935e+02 -2.849940593920691e+02 -7.916450257599642e+01 -5.577325610990745e+02 + 4 6.968546475698608e+02 4.194855466184786e+02 1.813259430292276e+02 5.260790609808306e+02 + ME 4.659721382036548e-04 + +Event 32 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.235176898898733e+02 -4.762113006241283e+02 -2.880822916693120e+01 5.439400065022984e+02 + 3 6.603902828461303e+02 4.672103814637362e+02 1.031050210016799e+02 -4.551913221650266e+02 + 4 1.160920272639969e+02 9.000919160391994e+00 -7.429679183474865e+01 -8.874868433727180e+01 + ME 4.479912373700687e-03 + +Event 33 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.786737271642295e+02 2.009638309376701e+02 4.090184839380261e+02 1.464443769121514e+02 + 3 3.795793219608412e+02 -6.057523839522326e+00 -8.244277697544295e+01 3.704685635647953e+02 + 4 6.417469508749324e+02 -1.949063070981499e+02 -3.265757069625828e+02 -5.169129404769462e+02 + ME 1.347708798358061e-02 + +Event 34 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.621583515140109e+02 -5.051303032557108e+02 -1.429543729176959e+02 4.035605363216953e+02 + 3 3.008522892707525e+02 8.677543723835063e+01 2.726747894692539e+02 -9.290092916351111e+01 + 4 5.369893592152367e+02 4.183548660173603e+02 -1.297204165515579e+02 -3.106596071581844e+02 + ME 6.480754796569888e-04 + +Event 35 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.158114977149371e+02 2.502256147979830e+02 4.233348779616201e+00 5.626659943296694e+02 + 3 1.476397433483021e+02 -1.670550278282843e+01 -6.055370982200890e+01 1.336101351676488e+02 + 4 7.365487589367604e+02 -2.335201120151546e+02 5.632036104239268e+01 -6.962761294973183e+02 
+ ME 2.128521537817902e+00 + +Event 36 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.182456511154909e+02 -7.463771462544149e+01 -6.667773110518938e+02 2.563475070450520e+02 + 3 4.860008755751821e+02 -7.840660561780857e+01 4.141081959217036e+02 -2.419992919944375e+02 + 4 2.957534733093265e+02 1.530443202432501e+02 2.526691151301903e+02 -1.434821505061439e+01 + ME 9.654932671123303e-05 + +Event 37 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.672182018814331e+02 -2.031706828392723e+00 -5.267408190306547e+02 2.104197478372324e+02 + 3 4.664069288608284e+02 3.712365792892206e+02 2.604523782658950e+02 -1.090109358856581e+02 + 4 4.663748692577388e+02 -3.692048724608279e+02 2.662884407647598e+02 -1.014088119515744e+02 + ME 1.219980965527749e-04 + +Event 38 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.068057345787184e+02 4.883513201966849e+02 -7.570036138649979e+01 -1.124032737511800e+02 + 3 3.871140338254016e+02 -1.153787089711744e+02 -3.599073977747532e+02 -8.373585688177310e+01 + 4 6.060802315958794e+02 -3.729726112255106e+02 4.356077591612531e+02 1.961391306329531e+02 + ME 1.008873452940717e-04 + +Event 39 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.960337392567769e+02 -3.669089247616476e+02 2.651961920161228e+02 -2.027271347192069e+02 + 3 2.837821967046824e+02 -2.822567153069604e+02 -2.935613327724534e+01 -1.303560381865560e+00 + 4 7.201840640385411e+02 
6.491656400686079e+02 -2.358400587388775e+02 2.040306951010725e+02 + ME 1.377484700297836e-03 + +Event 40 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.080730228651937e+02 -3.065830270999448e+02 -2.484308296331463e+01 1.728167064871203e+01 + 3 6.842346640746096e+02 4.630487823766367e+02 8.554554725666559e+01 -4.964321303112498e+02 + 4 5.076923130601963e+02 -1.564657552766919e+02 -6.070246429335082e+01 4.791504596625379e+02 + ME 4.222110575527899e-05 + +Event 41 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.602650851118221e+02 -1.258781096038287e+02 -9.817642232798531e+01 1.417706342452912e+01 + 3 7.146392966623014e+02 6.799675591776853e+02 -1.019163870176435e+02 1.948499239342933e+02 + 4 6.250956182258764e+02 -5.540894495738563e+02 2.000928093456288e+02 -2.090269873588226e+02 + ME 4.548693916986712e-04 + +Event 42 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.687893235969910e+02 1.289401357197518e+02 4.788693514682045e+01 9.783209393213438e+01 + 3 7.042017295435161e+02 -1.022058447296739e+02 -6.640064324330017e+02 -2.110675220936915e+02 + 4 6.270089468594927e+02 -2.673429099007782e+01 6.161194972861812e+02 1.132354281615571e+02 + ME 1.699328969695654e-04 + +Event 43 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.729783670130408e+02 -7.983817933050126e+01 9.052957805204312e+01 4.573169538528310e+02 + 3 5.638402597824536e+02 4.785250044669658e+02 7.435095949863266e+01 
-2.887933404236804e+02 + 4 4.631813732045056e+02 -3.986868251364647e+02 -1.648805375506758e+02 -1.685236134291506e+02 + ME 5.918894129186977e-04 + +Event 44 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.774791104122977e+02 -1.952605982635783e+01 6.371003613266311e+01 1.644949814321786e+02 + 3 7.194816205691245e+02 -3.678871192485065e+02 2.644831693887217e+01 -6.177486190667771e+02 + 4 6.030392690185776e+02 3.874131790748644e+02 -9.015835307153534e+01 4.532536376345984e+02 + ME 2.086667222834367e-04 + +Event 45 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.477488480180841e+02 -3.787655987618923e+02 1.634662296474455e+02 6.236535517992065e+02 + 3 7.458113398274103e+02 3.819163358711198e+02 -1.661042992235261e+02 -6.186952632673017e+02 + 4 6.439812154506047e+00 -3.150737109227506e+00 2.638069576080606e+00 -4.958288531904773e+00 + ME 9.379174202597218e-02 + +Event 46 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.243146757688279e+02 -4.392587631431587e+00 -2.496903827548322e+02 -2.069188895501946e+02 + 3 5.341608950426614e+02 -2.704482657861201e+02 2.711825143656835e+02 -3.723515022507137e+02 + 4 6.415244291885106e+02 2.748408534175518e+02 -2.149213161085116e+01 5.792703918009084e+02 + ME 1.885255707322850e-04 + +Event 47 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.742198761450970e+02 -3.282965096491569e+02 5.301803926793565e+02 -2.563251730900703e+02 + 3 
6.484148720042494e+02 3.527030795571957e+02 -3.975273148506380e+02 3.715029176935213e+02 + 4 1.773652518506535e+02 -2.440656990803884e+01 -1.326530778287185e+02 -1.151777446034509e+02 + ME 1.138468449162823e-03 + +Event 48 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.321401810535270e+02 -1.843482647928687e+02 4.412348098999295e+02 5.543976952635381e+02 + 3 7.293058265076229e+02 2.182722651304251e+02 -4.435200216702997e+02 -5.362221528717154e+02 + 4 3.855399243885008e+01 -3.392400033755636e+01 2.285211770370228e+00 -1.817554239182278e+01 + ME 2.279938590427001e-03 + +Event 49 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.511117284856090e+02 -3.272266866652173e+02 5.199533974843239e+01 1.161835877338140e+02 + 3 7.326526490901412e+02 6.615045961628415e+02 -2.993354007364775e+02 -9.792799058578565e+01 + 4 4.162356224242500e+02 -3.342779094976241e+02 2.473400609880451e+02 -1.825559714802838e+01 + ME 8.833191284340252e-05 + +Event 50 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.322170903075261e+02 2.740692406080843e+02 1.952596610981922e+01 -6.787095515302594e+02 + 3 3.078559130669523e+02 -1.663333363406682e+02 8.625456119089937e+01 2.442716420418761e+02 + 4 4.599269966255218e+02 -1.077359042674160e+02 -1.057805273007185e+02 4.344379094883834e+02 + ME 7.605971331114271e-05 + +Event 51 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.473696038265161e+02 
-2.922314643158455e+02 -6.759614889845236e+01 -1.752060888796554e+02 + 3 5.389399151999500e+02 -2.449040872454050e+02 9.346474502284559e+01 4.708954891311221e+02 + 4 6.136904809735342e+02 5.371355515612505e+02 -2.586859612439322e+01 -2.956894002514666e+02 + ME 4.725932497293742e-04 + +Event 52 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.818614816439092e+02 5.970116833066722e+02 3.013730734325877e+02 1.329902280423528e+02 + 3 2.108623144448949e+02 -4.198344769951677e+00 -1.698802183673394e+02 -1.248439063859964e+02 + 4 6.072762039111955e+02 -5.928133385367207e+02 -1.314928550652483e+02 -8.146321656356342e+00 + ME 1.641663059100063e-04 + +Event 53 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.157714002491656e+02 -5.140718537651751e+02 -4.182413977701254e+01 1.003899065692042e+00 + 3 5.148181840855221e+02 2.868792199999327e+02 1.974924151010656e+02 3.791237552236646e+02 + 4 4.694104156653124e+02 2.271926337652422e+02 -1.556682753240530e+02 -3.801276542893567e+02 + ME 3.192045950410474e-03 + +Event 54 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.433410767101755e+02 2.586883950027282e+02 -5.809813083922763e+02 9.710187728524585e+01 + 3 6.928799734080566e+02 -1.579832568796112e+02 6.405510983559769e+02 -2.117031848853748e+02 + 4 1.637789498817686e+02 -1.007051381231171e+02 -5.956978996370076e+01 1.146013076001289e+02 + ME 3.283447211651567e-05 + +Event 55 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.193759752058201e+02 -3.536444481659258e+02 -7.212523476050659e+01 -6.222823703878202e+02 + 3 5.307053661742267e+02 2.409461639849982e+02 1.900944302490854e+02 4.329633233142391e+02 + 4 2.499186586199529e+02 1.126982841809279e+02 -1.179691954885788e+02 1.893190470735813e+02 + ME 3.947074140758790e-05 + +Event 56 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.858864959547015e+02 1.815174721437793e+02 3.218581876578407e+02 -1.112074732396182e+02 + 3 4.484505297447189e+02 -3.244105157450005e+02 2.934585578803474e+02 -9.873079412811626e+01 + 4 6.656629743005794e+02 1.428930436012212e+02 -6.153167455381879e+02 2.099382673677345e+02 + ME 2.331514916593817e-04 + +Event 57 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.284589752749191e+02 3.868194647882292e+02 -1.709996888155516e+02 3.168575336559792e+02 + 3 6.299868555278972e+02 -1.587414880613578e+02 2.327134172236621e+02 -5.634971548731003e+02 + 4 3.415541691971833e+02 -2.280779767268714e+02 -6.171372840811039e+01 2.466396212171209e+02 + ME 3.478087926442680e-05 + +Event 58 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.172037319760957e+02 -2.246119436411400e+02 -2.286037628748728e+01 5.744278237820342e+02 + 3 5.117934503257735e+02 1.262762853074207e+02 3.215736628881853e+02 -3.775939815489577e+02 + 4 3.710028176981306e+02 9.833565833371921e+01 -2.987132866006979e+02 -1.968338422330765e+02 + ME 6.188959124461504e-04 + +Event 59 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.388935626701858e+02 -3.912134623809441e+02 -5.457789630286015e+02 3.082872805076099e+02 + 3 1.936051438730608e+02 1.561492575196544e+02 8.304673385628061e+01 -7.876294246644987e+01 + 4 5.675012934567535e+02 2.350642048612896e+02 4.627322291723209e+02 -2.295243380411600e+02 + ME 4.128060597493498e-04 + +Event 60 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.258141426633659e+02 -5.584991156701968e+02 1.635894950857984e+02 4.337319270970709e+02 + 3 2.789580074371136e+02 2.331554478032953e+02 6.512410160032128e+01 -1.386180308029247e+02 + 4 4.952278498995201e+02 3.253436678669015e+02 -2.287135966861195e+02 -2.951138962941461e+02 + ME 7.310848418576032e-04 + +Event 61 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.906141202026897e+02 4.485275282318680e+02 -2.043613424290570e+02 3.253990429020988e+02 + 3 4.163572165237975e+02 -4.021600557528675e+02 -4.112755461437413e+01 9.964509802161204e+01 + 4 4.930286632735124e+02 -4.636747247900049e+01 2.454888970434311e+02 -4.250441409237108e+02 + ME 5.864153159526680e-03 + +Event 62 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.346180891175762e+02 3.693463141798367e+02 7.549194961263062e+01 -6.305140780380819e+02 + 3 4.420621433230785e+02 -2.806743363126464e+02 3.467380983154045e+01 3.397625382625571e+02 + 4 3.233197675593453e+02 -8.867197786719018e+01 -1.101657594441711e+02 2.907515397755249e+02 + ME 3.975852881354506e-05 + +Event 63 Batch 0 + 0 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.451039732729312e+02 -2.415045377667665e+02 1.990362537024482e+02 -5.641092662620229e+02 + 3 3.260870385294104e+02 2.061141051805975e+02 -2.496695602716584e+02 3.892098426606745e+01 + 4 5.288089881976583e+02 3.539043258616898e+01 5.063330656921013e+01 5.251882819959554e+02 + ME 4.849703889620977e-04 + +Event 64 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.275973380665292e+02 -6.064553482667328e+01 4.309976929667101e+02 -2.981980196075213e+02 + 3 5.799838776791828e+02 3.279821268626862e+02 -1.824214634122377e+02 4.421893627315650e+02 + 4 3.924187842542881e+02 -2.673365920360130e+02 -2.485762295544724e+02 -1.439913431240437e+02 + ME 2.181033568069889e-04 + +Event 65 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.480172869826540e+02 2.720879118036236e+02 -5.153900904044359e+02 -2.833154199679406e+02 + 3 7.075023253568394e+02 -3.440299289242928e+02 4.709796137500282e+02 4.004761563708322e+02 + 4 1.444803876605064e+02 7.194201712066916e+01 4.441047665440793e+01 -1.171607364028916e+02 + ME 4.996070965373265e-03 + +Event 66 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.472978185025793e+02 4.857452785131265e+02 -2.223654169683453e+02 -1.189119332799752e+02 + 3 3.203062148499982e+02 1.169702135976477e+02 2.922172461416276e+02 -5.935588816501104e+01 + 4 6.323959666474223e+02 -6.027154921107742e+02 -6.985182917328221e+01 1.782678214449862e+02 
+ ME 1.350431107807246e-04 + +Event 67 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.264671493042945e+02 1.195959046886509e+02 -2.647539231733029e+02 3.122121220929445e+02 + 3 5.059969655247560e+02 3.777175441887565e+02 -7.608313561896590e+00 -3.366073372596323e+02 + 4 5.675358851709478e+02 -4.973134488774076e+02 2.723622367351999e+02 2.439521516668852e+01 + ME 9.805248085114752e-05 + +Event 68 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.996105691520875e+02 -3.814725562071958e+02 -3.417794545715574e+02 3.117664637712125e+02 + 3 2.164196744806214e+02 1.292759463548889e+02 -1.184749651041616e+02 1.268419798013014e+02 + 4 6.839697563672919e+02 2.521966098523067e+02 4.602544196757190e+02 -4.386084435725138e+02 + ME 2.964497825738480e-03 + +Event 69 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.950546755511077e+02 -1.873718558932053e+02 -4.578972175289679e+02 -1.735101101888632e+01 + 3 4.768584394819692e+02 -1.830244097668608e+02 2.985566003539792e+02 -3.236664843936508e+02 + 4 5.280868849669231e+02 3.703962656600662e+02 1.593406171749887e+02 3.410174954125370e+02 + ME 5.240005601642205e-05 + +Event 70 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.918343395272258e+02 6.895733556028865e+02 -5.391072441382606e+01 -1.473005040127906e+01 + 3 2.169590284692678e+02 -1.127375202028747e+02 1.807969800614662e+02 4.091361110301506e+01 + 4 5.912066320035063e+02 
-5.768358354000119e+02 -1.268862556476402e+02 -2.618356070173603e+01 + ME 1.599438296512021e-04 + +Event 71 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.156371334918730e+02 1.547202099034306e+02 -4.807172487652236e+02 1.041836686949964e+02 + 3 3.718518305526426e+02 -8.969821893462726e+01 -7.521366892975189e+01 -3.529460545344468e+02 + 4 6.125110359554843e+02 -6.502199096880338e+01 5.559309176949757e+02 2.487623858394504e+02 + ME 1.126807746276446e-04 + +Event 72 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.110577464974889e+02 5.009520239746098e+01 -1.453533690489527e+02 -1.445968227848547e+02 + 3 7.317124633441163e+02 -4.429659627226336e+02 5.264774879404380e+02 2.490095170354977e+02 + 4 5.572297901583944e+02 3.928707603251725e+02 -3.811241188914850e+02 -1.044126942506430e+02 + ME 1.827405909657921e-04 + +Event 73 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.932257450488245e+02 3.105005764664298e+01 -2.932679039283982e+02 2.601082794045340e+02 + 3 5.658879124646471e+02 3.645905401293643e+02 4.244364556305354e+02 8.459646951004228e+01 + 4 5.408863424865280e+02 -3.956405977760073e+02 -1.311685517021372e+02 -3.447047489145762e+02 + ME 8.993028721661693e-04 + +Event 74 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.374854102925440e+02 7.785209805930555e+01 4.289805712042689e+01 1.048858692406466e+02 + 3 6.381281910764945e+02 -1.004137270491618e+02 
-1.591026937267357e+02 6.097630724433484e+02 + 4 7.243863986309617e+02 2.256162898985645e+01 1.162046366063089e+02 -7.146489416839951e+02 + ME 1.411586672653399e+01 + +Event 75 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.936883054156938e+02 -3.438525101293580e+00 -2.706855443967301e+02 5.283780053968293e+02 + 3 5.912298912592890e+02 1.109657062166288e+02 4.832067437414102e+02 -3.221034603433170e+02 + 4 3.150818033250173e+02 -1.075271811153352e+02 -2.125211993446804e+02 -2.062745450535123e+02 + ME 1.380780220839166e-03 + +Event 76 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.619486867997674e+02 2.801967015359573e+01 2.136411519593738e+02 6.258980909300585e+02 + 3 1.201252731414032e+02 2.274423842261747e+01 -8.754996679960183e+01 7.904292618103446e+01 + 4 7.179260400588299e+02 -5.076390857621330e+01 -1.260911851597719e+02 -7.049410171110932e+02 + ME 5.935698698094391e+00 + +Event 77 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.456676259451604e+02 -7.346624001550108e+02 6.511229493320700e+01 -1.097804865615983e+02 + 3 1.284204120828029e+02 1.251494694834492e+02 2.867183268690426e+01 2.708973588335758e+00 + 4 6.259119619720370e+02 6.095129306715618e+02 -9.378412762011116e+01 1.070715129732624e+02 + ME 1.673867008207173e-04 + +Event 78 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.040158920877625e+02 6.911264613612160e+02 -6.659640240533207e+01 
-1.163937709034253e+02 + 3 5.185438503615325e+02 -4.976050220224221e+02 -1.270913363611936e+02 7.158742227342900e+01 + 4 2.774402575507043e+02 -1.935214393387938e+02 1.936877387665258e+02 4.480634862999637e+01 + ME 5.336814072731610e-05 + +Event 79 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.777589592768841e+02 1.742725197144059e+02 -4.776543849198210e+01 6.532264221831092e+02 + 3 5.725002211294491e+02 -1.786302554544233e+02 -1.627852110918317e+02 -5.189881598643107e+02 + 4 2.497408195936666e+02 4.357735740017461e+00 2.105506495838138e+02 -1.342382623187985e+02 + ME 9.164970190455331e-04 + +Event 80 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.240819586861880e+02 4.679310297228965e+02 -4.118464023828053e+02 -3.002304821964348e+01 + 3 6.688675489057649e+02 -5.494372353172420e+02 3.251429131208653e+02 1.994607943266771e+02 + 4 2.070504924080468e+02 8.150620559434545e+01 8.670348926194001e+01 -1.694377461070337e+02 + ME 3.581385563739766e-03 + +Event 81 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.198056748722776e+02 1.034797897616987e+02 -2.885605608993972e+02 4.197888462474007e+02 + 3 5.672098642055398e+02 -4.160331805498524e+02 2.087659545613753e+01 -3.849773895903518e+02 + 4 4.129844609221831e+02 3.125533907881537e+02 2.676839654432596e+02 -3.481145665704891e+01 + ME 1.021798343494535e-04 + +Event 82 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
2.057598609140512e+02 6.385349666266646e+01 -2.765433460911305e+01 1.936364870179370e+02 + 3 6.235840147705877e+02 4.654039114453889e+02 -3.828889383639961e+02 -1.601633028106900e+02 + 4 6.706561243153623e+02 -5.292574081080551e+02 4.105432729731098e+02 -3.347318420724695e+01 + ME 6.937665415201322e-04 + +Event 83 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.583322583736492e+02 1.865539504254553e+02 -1.926584839569474e+02 6.012334775737429e+02 + 3 3.620902826842561e+02 -3.107067244571256e+02 -1.177956631152976e+01 -1.855584705935048e+02 + 4 4.795774589420946e+02 1.241527740316703e+02 2.044380502684771e+02 -4.156750069802382e+02 + ME 8.401890238995615e-03 + +Event 84 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.849329564663157e+02 -2.622178945286149e+02 4.068620488841210e+02 -2.941124332559830e+01 + 3 4.737588937677758e+02 6.014532316188536e+01 -1.333934272225748e+02 4.505954095412365e+02 + 4 5.413081497659073e+02 2.020725713667294e+02 -2.734686216615458e+02 -4.211841662156386e+02 + ME 5.161145291816592e-03 + +Event 85 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.085742632080858e+02 -2.174614026040271e+02 -5.283468657604089e+02 -4.190914152061854e+02 + 3 5.315764222715956e+02 8.528530557199831e+00 3.820092234108130e+02 3.695533927738616e+02 + 4 2.598493145203189e+02 2.089328720468272e+02 1.463376423495959e+02 4.953802243232386e+01 + ME 6.350811792060042e-05 + +Event 86 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.724500140939190e+02 1.231518677708316e+02 -1.121928207497680e+01 1.201946443701656e+02 + 3 7.028475062724231e+02 -6.467096040851285e+01 -4.553168759141600e+02 -5.315061866629339e+02 + 4 6.247024796336580e+02 -5.848090736231880e+01 4.665361579891369e+02 4.113115422927684e+02 + ME 1.172746972924033e-04 + +Event 87 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.942099203196797e+02 -7.751148196958455e+01 -1.356691819650310e+02 -1.153400900745028e+02 + 3 7.314670447251598e+02 1.724617634710876e+02 7.020747158546046e+02 1.113196793791551e+02 + 4 5.743230349551608e+02 -9.495028150150301e+01 -5.664055338895736e+02 4.020410695347638e+00 + ME 1.240875138008425e-04 + +Event 88 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.382497024023744e+02 2.632142028760094e+02 -5.613974181649784e+02 1.513733956108635e+02 + 3 3.997044228265544e+02 -5.264940326118349e+01 3.435187961344461e+02 1.974500004195773e+02 + 4 4.620458747710724e+02 -2.105647996148253e+02 2.178786220305324e+02 -3.488233960304407e+02 + ME 1.869761458836694e-03 + +Event 89 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.419006640093282e+02 -8.677155154367875e+01 6.457545216231645e+01 -9.185046144153738e+01 + 3 7.131224514048052e+02 5.460003286026869e+02 -4.154556538506973e+02 -1.944836022569670e+02 + 4 6.449768845858667e+02 -4.592287770590081e+02 3.508802016883808e+02 2.863340636985044e+02 + ME 1.142129056044379e-04 + +Event 90 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.730615760623939e+02 -6.017783679015005e+01 -5.202921970507185e+02 -2.325386583054728e+02 + 3 5.389913703864468e+02 -6.302812531165209e+01 2.446311215742109e+02 4.761247390423042e+02 + 4 3.879470535511588e+02 1.232059621018019e+02 2.756610754765077e+02 -2.435860807368315e+02 + ME 1.096229299051017e-03 + +Event 91 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.546745139784348e+02 -1.470341619195493e+02 -1.726383255301703e+02 -3.940886669878754e+02 + 3 5.110976540119646e+02 -2.482119727393536e+02 -1.865817698532448e+02 4.059542728975802e+02 + 4 5.342278320096004e+02 3.952461346589030e+02 3.592200953834151e+02 -1.186560590970475e+01 + ME 8.836093926880085e-05 + +Event 92 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.683728375977241e+02 -1.148152650923627e+02 3.458291789782991e+02 5.603051703379153e+02 + 3 2.872567998557088e+02 1.635098024620329e+02 7.847331657016400e+01 -2.227620976482501e+02 + 4 5.443703625465666e+02 -4.869453736967034e+01 -4.243024955484631e+02 -3.375430726896653e+02 + ME 8.302876705662708e-04 + +Event 93 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.666948073002088e+02 5.408074886689032e+01 5.639942928586390e+02 -1.134525653745258e+01 + 3 6.168025492529713e+02 2.439040545997395e+02 -5.541969602989467e+02 1.175666879272316e+02 + 4 3.165026434468199e+02 -2.979848034666298e+02 -9.797332559692304e+00 -1.062214313897791e+02 + ME 1.668148287200345e-04 + +Event 94 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.964349376711386e+02 8.445930034540564e+01 -2.409007074648562e+02 -4.257712097695705e+02 + 3 5.660980232871289e+02 1.373833465612049e+02 5.210669225216058e+02 1.734417778711397e+02 + 4 4.374670390417325e+02 -2.218426469066105e+02 -2.801662150567495e+02 2.523294318984308e+02 + ME 3.436014770096700e-05 + +Event 95 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.117074025057369e+02 -3.227984571262281e+02 4.276971164854591e+02 -4.684055501468923e+02 + 3 1.264078228725326e+02 8.675876182178399e+01 5.074873328843476e+01 7.665781760618941e+01 + 4 6.618847746217319e+02 2.360396953044438e+02 -4.784458497738943e+02 3.917477325407025e+02 + ME 2.140891407038800e-04 + +Event 96 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.329769441659942e+02 -9.642859092211873e+01 6.903981466332599e+02 -2.265107649915409e+02 + 3 3.937873938465681e+02 -4.837693103302090e+01 -3.847118583018797e+02 6.873841850241250e+01 + 4 3.732356619874388e+02 1.448055219551397e+02 -3.056862883313803e+02 1.577723464891279e+02 + ME 3.481367294411194e-05 + +Event 97 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.394989963266854e+01 6.003767577498499e+00 -2.078495220615400e+01 2.616364312804199e+01 + 3 7.377311980366452e+02 -5.308290258162607e+02 4.681853362634530e+02 2.080152802450354e+02 + 4 7.283189023306865e+02 5.248252582387622e+02 -4.474003840572990e+02 
-2.341789233730774e+02 + ME 2.075265497746822e-02 + +Event 98 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.496912687496081e+02 -2.485814905959505e+02 -5.435228288348339e-01 -2.350907922099247e+01 + 3 7.458289852530974e+02 7.373315781279123e+02 9.801365830907574e+01 -5.473885205171281e+01 + 4 5.044797459972944e+02 -4.887500875319617e+02 -9.747013548024090e+01 7.824793127270527e+01 + ME 6.821383260945018e-05 + +Event 99 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.698125854886769e+02 8.336002034290718e+01 8.774494220182724e+01 -1.191144253093525e+02 + 3 6.496622934125945e+02 5.714329899004553e+02 -6.230613627727956e+01 3.027265745152471e+02 + 4 6.805251210987283e+02 -6.547930102433625e+02 -2.543880592454770e+01 -1.836121492058946e+02 + ME 6.152563019991606e-04 + +Event 100 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.141460480129781e+02 -5.842473718080512e+02 -5.092222124447422e+01 1.823110095657221e+02 + 3 3.909476383151781e+02 2.539115798088024e+02 -2.930333502072385e+02 -5.000421191795164e+01 + 4 4.949063136718438e+02 3.303357919992488e+02 3.439555714517127e+02 -1.323067976477706e+02 + ME 1.554648645082860e-04 + +Event 101 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.469346538870476e+02 3.524232024688499e+02 -1.488240016505349e+02 -6.415299525912138e+02 + 3 6.502268999047171e+02 -2.777200960400716e+02 1.351761574712158e+02 5.721835160737410e+02 + 4 
1.028384462082358e+02 -7.470310642877821e+01 1.364784417931911e+01 6.934643651747267e+01 + ME 1.079747936907453e-04 + +Event 102 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.426790432885583e+02 -3.141071077544728e+02 6.615000409077074e+02 1.238005738162371e+02 + 3 6.735764515788642e+01 -4.139700837311953e+00 -5.533298776898177e+01 -3.818606686673834e+01 + 4 6.899633115535552e+02 3.182468085917849e+02 -6.061670531387255e+02 -8.561450694949879e+01 + ME 6.339493051337617e-04 + +Event 103 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.837874798175255e+02 -2.731724972668680e+02 1.247027290420595e+02 -3.793103501549070e+02 + 3 4.466406321977811e+02 -2.904538080082218e+02 -1.536665846758872e+02 3.025078850172423e+02 + 4 5.695718879846933e+02 5.636263052750896e+02 2.896385563382781e+01 7.680246513766477e+01 + ME 8.176368118734783e-05 + +Event 104 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.788466572679498e+02 3.572346730226224e+02 -3.682137844992379e+02 2.680773207965347e+02 + 3 2.925711988065158e+02 2.155069407513812e+02 1.697995838195863e+02 -1.016010147279926e+02 + 4 6.285821439255348e+02 -5.727416137740034e+02 1.984142006796517e+02 -1.664763060685422e+02 + ME 2.862966675495748e-04 + +Event 105 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.361125455083113e+02 2.619004058447622e+02 4.338373361330957e+01 -2.061496357605195e+02 + 3 5.299016201311088e+02 
2.892532450564946e+02 2.091058919093095e+02 3.916669672191839e+02 + 4 6.339858343605798e+02 -5.511536509012566e+02 -2.524896255226191e+02 -1.855173314586643e+02 + ME 2.885324252695820e-04 + +Event 106 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.578050478863485e+02 -2.265838270225943e+02 2.740910124726658e+02 -3.947579646386072e+01 + 3 5.202885196186892e+02 1.412729374205232e+02 1.631578432376887e+02 4.734148487210871e+02 + 4 6.219064324949621e+02 8.531088960207101e+01 -4.372488557103545e+02 -4.339390522572265e+02 + ME 1.924896468469497e-03 + +Event 107 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.409822745993890e+02 9.278463733038998e+01 5.102180459532772e+02 -1.540466750365500e+02 + 3 2.501852297905710e+02 1.682301834486208e+02 1.474652503315490e+02 1.120056004263085e+02 + 4 7.088324956100399e+02 -2.610148207790107e+02 -6.576832962848262e+02 4.204107461024152e+01 + ME 7.147674511319288e-04 + +Event 108 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.835202199428555e+02 6.670011709444186e+02 6.653656309718585e+01 1.337243986739828e+02 + 3 2.377887385005082e+02 -1.098327419601477e+02 7.667443498831059e+01 -1.964720946353502e+02 + 4 5.786910415566365e+02 -5.571684289842709e+02 -1.432109980854965e+02 6.274769596136723e+01 + ME 1.145686033371949e-04 + +Event 109 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.978180281189351e+02 4.291222314737004e+02 
2.249703559956600e+02 3.501840146583367e+02 + 3 3.585061336071062e+02 -3.227227650115257e+02 1.541688059097761e+02 2.467071262824851e+01 + 4 5.436758382739590e+02 -1.063994664621748e+02 -3.791391619054360e+02 -3.748547272865852e+02 + ME 1.163571884284121e-03 + +Event 110 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.073952645543156e+01 -4.753982451958468e+01 4.872856968801237e+01 -1.922426029646691e+01 + 3 7.438039776014969e+02 1.707202332282495e+02 -7.225114374584515e+02 4.556513803361385e+01 + 4 6.854564959430718e+02 -1.231804087086648e+02 6.737828677704391e+02 -2.634087773714689e+01 + ME 5.227309252777117e-04 + +Event 111 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.206822291802365e+02 -3.873336848644894e+02 2.415505427333673e+02 -2.504714268307115e+02 + 3 5.478000561519709e+02 4.687653961676167e+02 -2.245690260344170e+02 -1.729527606656598e+02 + 4 4.315177146677930e+02 -8.143171130312749e+01 -1.698151669895030e+01 4.234241874963712e+02 + ME 1.044891130451639e-04 + +Event 112 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.610471238372960e+02 2.563298943277285e+02 9.635756626046441e+01 -2.352981732387216e+02 + 3 6.139063356201011e+02 1.031778254919422e+02 -4.257030126280928e+02 4.301305270271112e+02 + 4 5.250465405426033e+02 -3.595077198196709e+02 3.293454463676283e+02 -1.948323537883896e+02 + ME 2.341641310883516e-04 + +Event 113 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.886653054136124e+02 3.035646198144377e+02 3.278619896967805e+02 -3.832517176826292e+02 + 3 5.420023902452333e+02 -3.658357535838290e+02 -3.990519958595696e+02 2.623541560166928e+01 + 4 3.693323043411537e+02 6.227113376939163e+01 7.119000616278893e+01 3.570163020809600e+02 + ME 6.920492747191044e-05 + +Event 114 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.165204340356855e+02 2.346362244736888e+01 6.298471388966840e+00 5.159487827839334e+02 + 3 5.932916594323345e+02 3.608814360715945e+02 -5.336137507463695e+01 -4.678804824963537e+02 + 4 3.901879065319799e+02 -3.843450585189634e+02 4.706290368567026e+01 -4.806830028757967e+01 + ME 5.272494855174562e-04 + +Event 115 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.432307281524775e+02 2.250327918244370e+02 4.870559856477669e+02 -8.506664127290338e+01 + 3 4.265243530840494e+02 2.057819224248363e+02 -2.472237669715339e+02 2.801021835354204e+02 + 4 5.302449187634724e+02 -4.308147142492733e+02 -2.398322186762331e+02 -1.950355422625171e+02 + ME 2.374600115510699e-04 + +Event 116 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.402635748890413e+02 -4.240500842615081e+02 -5.733358735035191e+01 -1.035683405941509e+02 + 3 4.399967684638557e+02 1.183617589007454e+02 -1.041572505293867e+02 -4.107784286579766e+02 + 4 6.197396566471035e+02 3.056883253607625e+02 1.614908378797389e+02 5.143467692521278e+02 + ME 1.345352413007975e-04 + +Event 117 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.074085311587982e+02 -4.270248480828711e+01 -3.034838508096459e+02 2.395944736750828e+01 + 3 5.360984061023379e+02 3.510554986169303e+02 -1.596589010508530e+02 -3.723849798683070e+02 + 4 6.564930627388640e+02 -3.083530138086433e+02 4.631427518604987e+02 3.484255325007987e+02 + ME 1.799583689554380e-04 + +Event 118 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.403602961735895e+02 4.471526113902057e+02 -1.804334130868148e+02 -2.439007487679596e+02 + 3 5.654623567965704e+02 -5.534570111367971e+02 -1.157195831079004e+02 6.480112868522362e+00 + 4 3.941773470298400e+02 1.063043997465926e+02 2.961529961947154e+02 2.374206358994369e+02 + ME 3.058951494327095e-05 + +Event 119 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 8.009099446659006e+01 5.775399043490317e+01 -2.629604726664822e+01 4.886268393818208e+01 + 3 7.131140611332346e+02 2.472685400460708e+02 -2.870014097539109e+02 -6.041689532644715e+02 + 4 7.067949444001754e+02 -3.050225304809738e+02 3.132974570205592e+02 5.553062693262893e+02 + ME 6.918127089212584e-04 + +Event 120 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.007248873753321e+02 2.708997263130530e+02 -3.880896283797751e+02 1.634784128397387e+02 + 3 7.413897277398675e+02 -4.257033276374028e+02 5.921425482134987e+02 -1.334264135464211e+02 + 4 2.578853848848011e+02 1.548036013243502e+02 -2.040529198337238e+02 -3.005199929331748e+01 + ME 1.036722475487658e-04 + +Event 121 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.732265116821120e+02 -1.149395375629033e+02 4.260916136383034e+02 3.658189076403450e+02 + 3 4.323948798659246e+02 -2.148488009071912e+01 -4.178027098651984e+02 1.092914804138530e+02 + 4 4.943786084519636e+02 1.364244176536225e+02 -8.288903773105277e+00 -4.751103880541979e+02 + ME 8.100493993931536e-02 + +Event 122 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.423360304412702e+02 2.648046119434483e+02 2.369247279710451e+01 -2.156644197927059e+02 + 3 6.059487982275790e+02 2.457729689670163e+01 -4.569077875801422e+02 3.972469964635579e+02 + 4 5.517151713311509e+02 -2.893819088401499e+02 4.332153147830377e+02 -1.815825766708520e+02 + ME 2.188082775378800e-04 + +Event 123 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.430133297276667e+02 -4.205671322284506e+01 3.498095937953862e+01 1.321377229770997e+02 + 3 7.140350670908592e+02 -2.955397919833875e+01 -6.570980288365158e+02 -2.778395577453973e+02 + 4 6.429516031814726e+02 7.161069242118353e+01 6.221170694569765e+02 1.457018347682965e+02 + ME 5.612523360524351e-04 + +Event 124 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.053457283343441e+02 5.458657819531910e+02 -1.853964251366731e+01 -2.610177782464908e+02 + 3 7.499633671623128e+02 -6.784114238502394e+02 2.145325921506609e+01 3.189713933003629e+02 + 4 1.446909045033435e+02 1.325456418970486e+02 -2.913616701398675e+00 -5.795361505387171e+01 + ME 
4.173279537865917e-04 + +Event 125 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.695439244882122e+02 9.058534244088483e+01 6.586171675820721e+02 7.941529525294386e+01 + 3 9.341516463500352e+01 3.490868167113007e+01 5.232133368429144e+01 6.906703243419068e+01 + 4 7.370409108767839e+02 -1.254940241120153e+02 -7.109385012663632e+02 -1.484823276871339e+02 + ME 1.130040599959981e-02 + +Event 126 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.465564354211969e+02 -2.094351601488127e+02 -1.930091683601272e+02 -5.804477571728034e+02 + 3 1.356182567235448e+02 -2.832094442380729e+01 9.735247446175228e+01 -9.007070211700794e+01 + 4 7.178253078552585e+02 2.377561045726200e+02 9.565669389837490e+01 6.705184592898115e+02 + ME 1.794899322427402e-03 + +Event 127 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.508388003927652e+02 -3.846405138087859e+02 7.756355374444067e+01 2.220162025777267e+02 + 3 6.162879941073577e+02 2.174727303224461e+02 1.334711143222092e+02 -5.609830344035004e+02 + 4 4.328732054998775e+02 1.671677834863399e+02 -2.110346680666500e+02 3.389668318257735e+02 + ME 3.926299991369257e-05 + +Event 128 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.468963146802860e+02 5.701805835528933e+02 -3.440982003215340e+02 -3.381488363986430e+02 + 3 1.196664332518720e+02 -9.337643239636880e+01 2.398139841985227e+01 7.089280393650263e+01 + 4 6.334372520678422e+02 
-4.768041511565245e+02 3.201168019016818e+02 2.672560324621405e+02 + ME 2.071729357507356e-04 + +Event 129 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.378966182438206e+02 -4.256397208622688e+02 4.624364030548156e+01 9.190104474357972e+01 + 3 7.127537996732576e+02 5.790589826349545e+02 -1.369827771626341e+02 -3.923574802896586e+02 + 4 3.493495820829216e+02 -1.534192617726859e+02 9.073913685715252e+01 3.004564355460789e+02 + ME 1.674829186367344e-05 + +Event 130 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.322026526626456e+02 5.905875735566585e+02 -2.387291116192750e+01 -2.243136110600485e+02 + 3 5.268087771404593e+02 -3.287250458747471e+02 1.913681034684307e+02 3.644798771698753e+02 + 4 3.409885701968954e+02 -2.618625276819114e+02 -1.674951923065031e+02 -1.401662661098268e+02 + ME 2.773399201658380e-04 + +Event 131 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.691964685177018e+02 -2.641651354044939e+02 4.065264362900751e+01 -3.210735842607329e+01 + 3 5.382709487855665e+02 -3.022535437819009e+02 -4.307865739991412e+02 1.131429946566680e+02 + 4 6.925325826967321e+02 5.664186791863948e+02 3.901339303701334e+02 -8.103563623059483e+01 + ME 5.387060593362983e-04 + +Event 132 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.376388194981168e+02 -2.491804956023666e+01 3.114513197621116e+01 1.317327453336230e+02 + 3 7.332494677489979e+02 -3.054807357444666e+02 
-6.882601889638179e+00 -6.665500220046780e+02 + 4 6.291117127528853e+02 3.303987853047033e+02 -2.426253008657300e+01 5.348172766710550e+02 + ME 3.600239248651741e-04 + +Event 133 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.818916885738672e+02 -3.437736592641007e+02 -2.113522447259726e+02 -4.192228966514222e+02 + 3 7.075583625851592e+02 3.695171106849944e+02 9.875952986414086e+01 5.952667441040354e+02 + 4 2.105499488409736e+02 -2.574345142089369e+01 1.125927148618317e+02 -1.760438474526132e+02 + ME 6.650550993609504e-03 + +Event 134 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.039051474789597e+02 -1.767404282002262e+02 5.832845063404939e+02 3.521710697233706e+02 + 3 6.740856043500102e+02 9.540039380435485e+01 -5.203258634262523e+02 -4.177932056695244e+02 + 4 1.220092481710302e+02 8.134003439587137e+01 -6.295864291424152e+01 6.562213594615409e+01 + ME 6.398457409951176e-05 + +Event 135 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.491379873081084e+02 -6.603965492909806e+02 -9.243924572685609e+01 -3.413782470545816e+02 + 3 4.360367703469753e+02 3.763875731093294e+02 3.833030381995055e+01 2.167746473012021e+02 + 4 3.148252423449159e+02 2.840089761816512e+02 5.410894190690560e+01 1.246035997533796e+02 + ME 3.740544797653714e-05 + +Event 136 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.907976432034610e+02 -8.965778913807026e+01 -5.375684903631193e+02 
-4.244796613161183e+02 + 3 4.317447428217262e+02 2.541758793770707e+02 2.501815833403359e+02 2.433255445990286e+02 + 4 3.774576139748128e+02 -1.645180902390004e+02 2.873869070227833e+02 1.811541167170898e+02 + ME 3.305598294827775e-05 + +Event 137 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.927917878715716e+02 -5.453882061843875e+02 -2.239274061847311e+02 6.172783069514816e+01 + 3 3.718333194205910e+02 2.859809174201714e+02 -2.363544177495510e+02 2.472896101988848e+01 + 4 5.353748927078368e+02 2.594072887642159e+02 4.602818239342820e+02 -8.645679171503689e+01 + ME 1.271508885999853e-04 + +Event 138 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.164849493482387e+02 2.012854405109472e+01 -2.573298799707043e+01 -1.118096528381494e+02 + 3 7.481698498358139e+02 -1.044692284663333e+02 -4.003634472873117e+00 7.408294509656059e+02 + 4 6.353452008159477e+02 8.434068441523856e+01 2.973662246994371e+01 -6.290197981274564e+02 + ME 3.558981928721817e+00 + +Event 139 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.415587822283577e+02 -2.468214832259765e+02 1.926082427237748e+02 1.365416492148350e+02 + 3 5.828887331044928e+02 -1.023403009989268e+02 -5.561813319045077e+02 1.412376154306548e+02 + 4 5.755524846671491e+02 3.491617842249035e+02 3.635730891807333e+02 -2.777792646454897e+02 + ME 4.162244884049924e-04 + +Event 140 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
4.395392082109441e+02 -3.037880820376849e+02 -2.455930383243060e+02 -2.014735126343028e+02 + 3 4.709796125547877e+02 -2.826270024952004e+02 2.984919122515593e+02 2.298833426397907e+02 + 4 5.894811792342678e+02 5.864150845328854e+02 -5.289887392725339e+01 -2.840983000548779e+01 + ME 1.224866890305543e-04 + +Event 141 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.025838986653695e+02 -2.680006525137058e+02 -6.218827689980461e+01 -1.259574698062632e+02 + 3 5.104624598690774e+02 -2.829910827131053e+02 4.173533268753468e+02 -7.939880721102661e+01 + 4 6.869536414655532e+02 5.509917352268112e+02 -3.551650499755422e+02 2.053562770172897e+02 + ME 3.744997832087190e-04 + +Event 142 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.390011511178413e+02 -3.153925512561954e+02 3.992377088505193e+01 -3.027468279160259e+02 + 3 4.597282536099518e+02 2.984856708041211e+02 -2.221794712617382e+02 -2.699863960308454e+02 + 4 6.012705952722067e+02 1.690688045207420e+01 1.822557003766862e+02 5.727332239468712e+02 + ME 1.636931943634938e-04 + +Event 143 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.103308443495007e+02 -3.626595603160224e+02 2.462759922459803e+02 5.589240443825270e+02 + 3 3.424564807343298e+02 4.507572778536915e+01 -2.357842367637252e+02 -2.442343416788665e+02 + 4 4.472126749161696e+02 3.175838325306533e+02 -1.049175548225538e+01 -3.146897027036604e+02 + ME 1.307103870169449e-03 + +Event 144 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.893886390440550e+02 -2.470805413393664e+02 1.331686162420118e+02 6.296618309717111e+02 + 3 7.132719020730981e+02 2.482972988978648e+02 -2.304803220538649e+02 -6.276815106349291e+02 + 4 9.733945888284487e+01 -1.216757558499173e+00 9.731170581185297e+01 -1.980320336781243e+00 + ME 3.763795443785492e-04 + +Event 145 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.784954309743687e+02 2.391836032855265e+02 1.115572896135231e+01 -2.931305935912624e+02 + 3 7.389406222827197e+02 -4.231861417520661e+02 1.513250860114714e+02 5.865555822189356e+02 + 4 3.825639467429115e+02 1.840025384665395e+02 -1.624808149728235e+02 -2.934249886276729e+02 + ME 2.197770375396454e-03 + +Event 146 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.681255842987410e+02 -3.253195724522379e+01 1.754808059398437e+02 -4.327698247100132e+02 + 3 2.875849079819392e+02 2.091841587061404e+01 1.879781824316579e+02 -2.166372592748876e+02 + 4 7.442895077193195e+02 1.161354137460974e+01 -3.634589883715017e+02 6.494070839849009e+02 + ME 5.365428658770597e-02 + +Event 147 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.442136391928777e+02 -1.784444843977844e+02 -1.666832492802189e+02 -3.816014311599316e+00 + 3 5.551361515401285e+02 1.378338123621512e+02 -5.199472642306259e+02 1.372327560591401e+02 + 4 7.006502092669938e+02 4.061067203563306e+01 6.866305135108448e+02 -1.334167417475408e+02 + ME 7.506593451016740e-04 + +Event 148 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.547263863263726e+02 3.928375677411887e+02 5.145105706241225e+01 2.231759855356057e+02 + 3 7.397285466814292e+02 -5.611511356388266e+02 -1.533645573573770e+02 -4.569322031694095e+02 + 4 3.055450669921979e+02 1.683135678976379e+02 1.019135002949646e+02 2.337562176338038e+02 + ME 1.447744081607946e-05 + +Event 149 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.343018799311635e+02 9.853424545130945e+01 1.924850318874441e+02 -9.021023174733594e+01 + 3 7.291173748950660e+02 3.429747374294526e+01 -5.990516617369192e+02 4.142136359886766e+02 + 4 5.365807451737705e+02 -1.328317191942546e+02 4.065666298494750e+02 -3.240034042413406e+02 + ME 8.426746322823631e-04 + +Event 150 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.707648023587807e+02 -8.969278865174961e+01 -3.008719699078221e+02 3.507859183712496e+02 + 3 6.876639918976695e+02 3.906111988928598e+02 4.609284537794546e+02 -3.284046551871671e+02 + 4 3.415712057435500e+02 -3.009184102411105e+02 -1.600564838716326e+02 -2.238126318408256e+01 + ME 1.073269251843420e-04 + +Event 151 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.503034458278050e+02 -1.575298496674959e+02 -3.658248853789648e+01 -6.298735108350153e+02 + 3 6.998690336552311e+02 1.302751858829804e+02 -1.019415103826456e+02 6.800389464387811e+02 + 4 1.498275205169628e+02 2.725466378451583e+01 1.385239989205421e+02 -5.016543560376589e+01 + ME 6.640529478938758e-04 + +Event 152 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.401192382353394e+02 1.493701961830190e+02 6.288419447382045e+02 3.605867993093738e+02 + 3 7.332111095478889e+02 -1.230079111936445e+02 -6.287602831147090e+02 -3.565502647954900e+02 + 4 2.666965221677111e+01 -2.636228498937447e+01 -8.166162349551351e-02 -4.036534513883709e+00 + ME 8.454055708389634e-04 + +Event 153 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.645797071775899e+02 7.941901905692946e+01 3.691428696980725e+02 -4.197337333594241e+02 + 3 6.079979027943974e+02 1.021455738177839e+02 -5.566920170809548e+02 2.220849604771994e+02 + 4 3.274223900280123e+02 -1.815645928747133e+02 1.875491473828823e+02 1.976487728822249e+02 + ME 2.849866805380214e-05 + +Event 154 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.022174885419887e+02 -5.152457849782368e+02 -1.493252664732706e+02 -2.736597328082223e+02 + 3 3.617627670199851e+02 1.925398333816265e+02 -2.626238171638091e+02 1.575736108034646e+02 + 4 5.360197444380262e+02 3.227059515966102e+02 4.119490836370798e+02 1.160861220047577e+02 + ME 6.460441220928935e-05 + +Event 155 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.202229507100909e+02 -2.107861924791834e+02 -3.212541876154507e+02 4.868690137883070e+02 + 3 2.943040328093192e+02 2.940980302320594e+02 1.073731199058898e+01 2.433613089266564e+00 + 4 5.854730164805901e+02 -8.331183775287643e+01 3.105168756248618e+02 
-4.893026268775735e+02 + ME 5.929895958203021e-03 + +Event 156 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.945486805149832e+02 4.540818864859257e+02 -1.431706201593250e+02 -1.337542944644701e+02 + 3 5.997303202813281e+02 -3.624214233270367e+02 -5.726286247273347e+01 4.743923835389624e+02 + 4 4.057209992036886e+02 -9.166046315888885e+01 2.004334826320584e+02 -3.406380890744924e+02 + ME 4.710987030939507e-03 + +Event 157 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.617003083190188e+02 3.118400043328062e+02 3.404502064148865e+02 -4.079626411035580e+00 + 3 5.720097526413111e+02 -4.999240316044800e+01 -4.329264075474301e+02 -3.705005295422581e+02 + 4 4.662899390396694e+02 -2.618476011723578e+02 9.247620113254364e+01 3.745801559532937e+02 + ME 3.912747406180316e-05 + +Event 158 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.784877363061536e+02 -5.707102180762959e+02 -3.102223423027389e+02 -1.959529373021938e+02 + 3 5.650909444059712e+02 5.525284805868615e+02 7.765167789879931e+01 8.950011457818250e+01 + 4 2.564213192878751e+02 1.818173748943441e+01 2.325706644039396e+02 1.064528227240113e+02 + ME 3.507923476335565e-05 + +Event 159 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.369491563274252e+02 2.154713482252002e+02 -2.912667909729743e+02 3.962955349875316e+02 + 3 6.066564496499102e+02 -4.020061311781470e+01 5.572389608252350e+02 -2.364332868806716e+02 + 4 
3.563943940226648e+02 -1.752707351073854e+02 -2.659721698522608e+02 -1.598622481068599e+02 + ME 3.206050547648561e-04 + +Event 160 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.492474755438521e+02 3.490068395973683e+02 1.460348644657109e+02 -5.276270735801971e+02 + 3 2.857818814470014e+02 -2.550253586192556e+02 1.227259509083861e+02 3.964456076362121e+01 + 4 5.649706430091474e+02 -9.398148097811274e+01 -2.687608153740975e+02 4.879825128165766e+02 + ME 6.739487149316230e-05 + +Event 161 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.770282049439583e+02 -2.863253153105185e+02 -4.911270786072976e+02 -3.676672364525181e+02 + 3 1.598243093356544e+02 -7.505362471426162e+01 1.299195075310523e+02 -5.506073768810753e+01 + 4 6.631474857203876e+02 3.613789400247800e+02 3.612075710762454e+02 4.227279741406256e+02 + ME 1.585084358449273e-04 + +Event 162 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.178592782584630e+02 -3.271131571456630e+02 3.943743741889438e+02 -7.512700901574513e+01 + 3 3.730686930366257e+02 -2.885924195736569e+01 -1.360208443078026e+02 -3.461874113706257e+02 + 4 6.090720287049107e+02 3.559723991030290e+02 -2.583535298811413e+02 4.213144203863708e+02 + ME 1.033033803745270e-04 + +Event 163 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.388642316037673e+02 3.152159924116781e+02 3.539969933522671e+01 -4.356149670486711e+02 + 3 5.364171791816749e+02 
-5.299694218906361e+02 3.369785517714305e+01 7.576448071880543e+01 + 4 4.247185892145582e+02 2.147534294789580e+02 -6.909755451236975e+01 3.598504863298658e+02 + ME 3.512719974074017e-05 + +Event 164 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.862697092177667e+02 4.132218376422068e+02 1.310202162324327e+02 -5.320221138485150e+02 + 3 4.476895523579006e+02 -2.769046850483522e+02 1.374187337517142e+02 3.238299280529300e+02 + 4 3.660407384243330e+02 -1.363171525938544e+02 -2.684389499841469e+02 2.081921857955847e+02 + ME 3.386495130720484e-05 + +Event 165 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.382444910715276e+02 -2.158277263671036e+02 -9.471372817531808e+00 -1.004446273032522e+02 + 3 7.304591383576045e+02 4.619003715882295e+02 -1.223345688256176e+02 5.524969256086772e+02 + 4 5.312963705708671e+02 -2.460726452211260e+02 1.318059416431495e+02 -4.520522983054249e+02 + ME 6.983271751192138e-03 + +Event 166 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.131352071380649e+02 -7.633553084455029e+01 -1.899581415396244e+02 5.929087379418958e+01 + 3 7.305557876753161e+02 8.980971292745940e+01 7.136333043711877e+02 1.279589045828712e+02 + 4 5.563090051866194e+02 -1.347418208290915e+01 -5.236751628315633e+02 -1.872497783770607e+02 + ME 3.327828037388997e-04 + +Event 167 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.122964103002419e+02 -3.405127102276982e+02 
6.366431608201745e+01 2.235761145061386e+02 + 3 4.697083356610920e+02 -2.521100678451879e+02 -2.856113063438231e+01 -3.952855880214881e+02 + 4 6.179952540386658e+02 5.926227780728861e+02 -3.510318544763516e+01 1.717094735153495e+02 + ME 1.149968561789038e-04 + +Event 168 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.156643283953479e+02 -3.999734570317169e+02 4.816586825103863e+02 3.467009924560656e+02 + 3 6.192344221355603e+02 2.722545660880235e+02 -4.999454120042315e+02 -2.436869012025524e+02 + 4 1.651012494690918e+02 1.277188909436935e+02 1.828672949384506e+01 -1.030140912535133e+02 + ME 1.018839971797586e-03 + +Event 169 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.626022684949455e+02 7.511110909567984e+01 -2.030941161665286e+02 -2.908461902563517e+02 + 3 5.580565590514409e+02 -2.529981754432838e+02 -3.439969378312538e+02 3.592842232626200e+02 + 4 5.793411724536144e+02 1.778870663476035e+02 5.470910539977823e+02 -6.843803300626824e+01 + ME 1.377146483384387e-04 + +Event 170 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.602909342483499e+02 4.699653539595538e+02 -3.020118498241595e+02 3.520021683086901e+02 + 3 1.039297502933439e+02 3.247420585022844e+01 -9.851348423194941e+01 6.473976746580496e+00 + 4 7.357793154583059e+02 -5.024395598097821e+02 4.005253340561090e+02 -3.584761450552707e+02 + ME 1.702044161949012e-02 + +Event 171 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 1.506693011949598e+02 -3.657300520509279e+01 -1.244227366169959e+02 -7.669834565089050e+01 + 3 6.344013325830558e+02 -2.026333084464632e+02 -4.956100871165361e+02 3.402578943089166e+02 + 4 7.149293662219821e+02 2.392063136515565e+02 6.200328237335322e+02 -2.635595486580258e+02 + ME 2.154674315939935e-03 + +Event 172 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.352445157558219e+02 -2.018352690102653e+02 3.892440882325301e+02 -3.069825004886508e+02 + 3 6.716112180685399e+02 2.825227203806541e+02 -5.978593235713690e+02 1.175022124175020e+02 + 4 2.931442661756387e+02 -8.068745137038918e+01 2.086152353388394e+02 1.894802880711482e+02 + ME 2.632874420159238e-05 + +Event 173 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.571348515648596e+02 -2.769863586381786e+02 5.805753619381593e+02 1.343019708712702e+02 + 3 5.332990408103323e+02 1.871824832342877e+02 -4.782426732337678e+02 1.437168410371091e+02 + 4 3.095661076248081e+02 8.980387540389081e+01 -1.023326887043916e+02 -2.780188119083794e+02 + ME 1.000693559775717e-02 + +Event 174 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.091496911716728e+02 -4.752584064243671e+02 3.135726231883978e+01 -3.797492797588730e+02 + 3 6.417481529658016e+02 3.309293137608123e+02 9.015643604119191e+01 5.424004960996682e+02 + 4 2.491021558625255e+02 1.443290926635548e+02 -1.215136983600317e+02 -1.626512163407953e+02 + ME 1.320362630614722e-03 + +Event 175 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.399801778396888e+02 1.966672297646827e+02 2.343185748302534e+02 -4.449667388535756e+02 + 3 6.987953575798325e+02 -1.857207036318897e+02 -9.664246188148672e+01 6.666955876403316e+02 + 4 2.612244645804785e+02 -1.094652613279310e+01 -1.376761129487668e+02 -2.217288487867561e+02 + ME 9.518106311173663e-03 + +Event 176 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.615757321243974e+02 -4.129469954321283e+02 4.686878756164517e+02 -2.179194886871010e+02 + 3 1.607981401590111e+02 -6.355407199259609e+01 7.929314438200188e+00 1.474925346731048e+02 + 4 6.776261277165925e+02 4.765010674247243e+02 -4.766171900546520e+02 7.042695401399618e+01 + ME 6.985103203668638e-04 + +Event 177 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.314334067424885e+02 -3.493619040652742e+02 -2.026482683689240e+01 -2.523299055494342e+02 + 3 4.840006500668402e+02 -1.846595828310068e+02 -1.450727057198389e+02 4.232155216776995e+02 + 4 5.845659431906719e+02 5.340214868962810e+02 1.653375325567312e+02 -1.708856161282654e+02 + ME 2.176440078840027e-04 + +Event 178 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.528135981327367e+02 -2.544528544607913e+02 1.436928116455423e+02 3.458992272209776e+02 + 3 3.053350882587862e+02 -1.380299578048219e+02 2.072032295570572e+02 1.767599177741536e+02 + 4 7.418513136084765e+02 3.924828122656130e+02 -3.508960412025995e+02 -5.226591449951311e+02 + ME 7.527164759442256e-02 + +Event 179 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.433145319259943e+02 -2.538538580850882e+02 -6.778753511348521e+02 -1.689962142519080e+02 + 3 1.647945947160298e+02 1.009041857568576e+02 1.171651165877689e+02 5.699069397138987e+01 + 4 5.918908733579761e+02 1.529496723282306e+02 5.607102345470832e+02 1.120055202805181e+02 + ME 1.343898999291521e-04 + +Event 180 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.396120216689869e+02 1.204528233788652e+02 -1.081248155319048e+02 1.766750195544081e+02 + 3 5.541470271917009e+02 2.767127195685323e+02 2.999096875483203e+02 3.749175614572561e+02 + 4 7.062409511393136e+02 -3.971655429473977e+02 -1.917848720164147e+02 -5.515925810116631e+02 + ME 1.331904622353186e-02 + +Event 181 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.165494222755782e+02 1.336973493521793e+02 -1.495065670853883e+02 -8.164837697364385e+01 + 3 6.960869932595207e+02 -2.848973600545249e+02 2.209041937252092e+01 6.347303441548928e+02 + 4 5.873635844649011e+02 1.512000107023455e+02 1.274161477128675e+02 -5.530819671812490e+02 + ME 6.185927142892157e-02 + +Event 182 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.472681881349898e+02 4.279258056181361e+02 3.994050733201775e+02 -2.762448183472868e+02 + 3 5.337197582091034e+02 -3.479343829022644e+02 -4.034091782989213e+02 -3.254965992745415e+01 + 4 3.190120536559072e+02 -7.999142271587159e+01 4.004104978744047e+00 3.087944782747408e+02 + ME 
6.408157605979969e-05 + +Event 183 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.165307808531155e+02 -3.276949594572818e+02 8.808524820164888e+01 -5.147496540405800e+02 + 3 2.975460412740736e+02 -1.030095950018340e+02 -2.375020297789283e+02 1.466814775843214e+02 + 4 5.859231778728107e+02 4.307045544591156e+02 1.494167815772794e+02 3.680681764562588e+02 + ME 6.910802494672606e-05 + +Event 184 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.645337360463251e+02 -3.940276919793660e+02 3.776398996283964e+02 1.443212503288767e+02 + 3 5.368100353438222e+02 2.392766596964612e+02 -1.719264331693737e+02 -4.487237410122138e+02 + 4 3.986562286098530e+02 1.547510322829050e+02 -2.057134664590229e+02 3.044024906833371e+02 + ME 3.556850559142083e-05 + +Event 185 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.347397779710934e+02 2.522092504724421e+02 -1.599825720327363e+02 5.600809373302327e+02 + 3 4.566768168089408e+02 -3.359958684022406e+02 -1.272903681003782e+02 -2.818823400219341e+02 + 4 4.085834052199662e+02 8.378661792979844e+01 2.872729401331146e+02 -2.781985973082987e+02 + ME 1.186460640924250e-03 + +Event 186 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.089823220133233e+02 -5.197119220861887e+02 4.248734840868308e+02 -2.281183322067745e+02 + 3 5.364076825758044e+02 3.588264146200085e+02 -3.973752875032956e+02 3.270606945152316e+01 + 4 2.546099954108726e+02 
1.608855074661802e+02 -2.749819658353517e+01 1.954122627552516e+02 + ME 2.587882549194956e-05 + +Event 187 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.835105223217562e+02 -2.128653471696258e+02 1.375287019182911e+02 -4.117725407538515e+02 + 3 7.240136612790379e+02 4.407273454759851e+02 -4.896543389042275e+01 5.723264583716988e+02 + 4 2.924758163992054e+02 -2.278619983063593e+02 -8.856326802786832e+01 -1.605539176178473e+02 + ME 5.316681075235348e-04 + +Event 188 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.611118500396009e+02 3.502021063704276e+02 -2.011693879247277e+02 -5.234102027267808e+02 + 3 3.072944371702249e+02 -6.894916504330921e+01 -1.599953986835476e+02 2.531350551695447e+02 + 4 5.315937127901743e+02 -2.812529413271185e+02 3.611647866082753e+02 2.702751475572363e+02 + ME 6.895251729568341e-05 + +Event 189 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.498478362545704e+02 6.780504955298834e+02 -3.199144947524264e+02 -1.319162971889923e+01 + 3 3.253008430749361e+02 -2.985087551774363e+02 1.291384938207140e+02 6.034152914782609e+00 + 4 4.248513206704935e+02 -3.795417403524470e+02 1.907760009317124e+02 7.157476804116659e+00 + ME 8.602858507536172e-05 + +Event 190 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.938867893347995e+02 3.689671478502748e+02 -1.218724623869293e+02 3.048516153777389e+02 + 3 5.264063001598521e+02 6.631942569346465e+01 
1.276367949726207e+02 -5.063735530147588e+02 + 4 4.797069105053494e+02 -4.352865735437401e+02 -5.764332585691415e+00 2.015219376370201e+02 + ME 4.767532488589928e-05 + +Event 191 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.681793141805986e+02 -3.225132888415707e+02 1.579589482507472e+02 -8.117977937027922e+01 + 3 5.431126642386394e+02 4.058413736814013e+01 9.147123993851423e+01 5.338139246166098e+02 + 4 5.887080215807622e+02 2.819291514734306e+02 -2.494301881892614e+02 -4.526341452463305e+02 + ME 4.903987113797056e-03 + +Event 192 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.054165399887860e+02 1.497087111729465e+02 8.905021611535379e+01 5.798159601983524e+02 + 3 2.106656439489221e+02 1.451894976721945e+02 -1.487249448604451e+02 3.436443048222167e+01 + 4 6.839178160622916e+02 -2.948982088451411e+02 5.967472874509132e+01 -6.141803906805740e+02 + ME 4.333474291796046e-02 + +Event 193 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.753169163933055e+02 -1.695475157411122e+02 -2.139406274107579e+02 3.581134319495643e+01 + 3 5.760219428901974e+02 -3.264616044953138e+02 1.527507522369444e+02 -4.493231656306969e+02 + 4 6.486611407164975e+02 4.960091202364259e+02 6.118987517381341e+01 4.135118224357404e+02 + ME 1.540826185869138e-04 + +Event 194 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.445934948105151e+02 -2.970257025567896e+02 -8.183019525038446e+01 
1.543509890854414e+02 + 3 7.485441862377920e+02 6.623797851941251e+02 1.083400559332055e+02 -3.314119056355291e+02 + 4 4.068623189516927e+02 -3.653540826373358e+02 -2.650986068282092e+01 1.770609165500877e+02 + ME 3.038654939315978e-05 + +Event 195 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.012122274303650e+02 -5.190018365965079e+01 1.322177369426911e+02 -1.425173724194239e+02 + 3 7.122630330184552e+02 -3.054768058087830e+02 -2.528097616133815e+02 5.916838461125125e+02 + 4 5.865247395511842e+02 3.573769894684374e+02 1.205920246706905e+02 -4.491664736930889e+02 + ME 3.023501477748519e-03 + +Event 196 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.490485793345990e+02 3.485190427929747e+02 -2.661098616642628e+01 -2.819059396826193e+02 + 3 5.531554978829223e+02 -3.330165694254378e+02 4.416170126965179e+02 7.442003978758297e+00 + 4 4.977959227824787e+02 -1.550247336753695e+01 -4.150060265300916e+02 2.744639357038610e+02 + ME 4.345743284950935e-05 + +Event 197 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.951249254444254e+02 -2.278358800090240e+02 3.101157211704545e+02 -8.968142489336995e+01 + 3 3.607080640108545e+02 -2.889948719219028e+02 2.155030307719242e+02 -1.227661082778766e+01 + 4 7.441670105447205e+02 5.168307519309260e+02 -5.256187519423793e+02 1.019580357211576e+02 + ME 3.391439408056234e-02 + +Event 198 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
3.750236904637999e+02 1.183014344420310e+02 -1.005952209347265e+02 -3.413621838211424e+02 + 3 4.381296266085965e+02 -2.726825461625328e+02 1.003845461170282e+02 -3.279096546785175e+02 + 4 6.868466829276034e+02 1.543811117205018e+02 2.106748176981258e-01 6.692718384996599e+02 + ME 9.680264113292831e-04 + +Event 199 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.454478562244572e+02 -2.058455361543722e+02 -1.131056012155069e+02 -7.126982772660263e+01 + 3 5.321797086694488e+02 -9.806778012582419e+01 -4.820333037417012e+02 -2.030808875905193e+02 + 4 7.223724351060941e+02 3.039133162801963e+02 5.951389049572082e+02 2.743507153171220e+02 + ME 1.581505656689570e-03 + +Event 200 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.952431318363245e+02 3.031309873729304e+02 9.337877017948551e+01 2.358159092128122e+02 + 3 6.094031244332665e+02 -7.796753338981907e+01 -5.315426896439308e+02 -2.876727322709445e+02 + 4 4.953537437304095e+02 -2.251634539831113e+02 4.381639194644453e+02 5.185682305813225e+01 + ME 6.727154090869418e-05 + +Event 201 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.497938633639727e+02 3.771120671245743e+02 3.553445817627055e+02 -3.921081252746437e+02 + 3 3.369790646193911e+02 -2.140351778515324e+02 1.061239955238162e+02 2.376584318047305e+02 + 4 5.132270720166355e+02 -1.630768892730419e+02 -4.614685772865218e+02 1.544496934699135e+02 + ME 6.312958572761704e-05 + +Event 202 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.267802742470180e+02 6.523432021666289e+02 -1.481957728499301e+02 2.840702844913056e+02 + 3 3.546086620137576e+02 -3.102429173963679e+02 -5.939291787501398e+01 -1.611493614224695e+02 + 4 4.186110637392243e+02 -3.421002847702609e+02 2.075886907249440e+02 -1.229209230688361e+02 + ME 1.897072351939181e-04 + +Event 203 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.830190702985664e+02 2.789429895135887e+02 -3.943102945050297e+02 -4.197918611657844e+00 + 3 5.247163710833167e+02 -4.266462829986154e+02 3.263988520595893e+01 3.037019215942699e+02 + 4 4.922645586181172e+02 1.477032934850268e+02 3.616704092990706e+02 -2.995040029826120e+02 + ME 5.851957370778088e-04 + +Event 204 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.952375769935189e+02 3.823764713153297e+01 6.531840992713524e+02 -2.350397908115461e+02 + 3 6.250862947179036e+02 1.031861473443960e+02 -5.506835576815645e+02 2.771878679515999e+02 + 4 1.796761282885782e+02 -1.414237944759291e+02 -1.025005415897879e+02 -4.214807714005372e+01 + ME 1.805914193340070e-04 + +Event 205 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.625197268936786e+02 2.955060596751036e+02 4.395356105446072e+02 -1.895074112086702e+02 + 3 3.144813194259644e+02 -1.941101430078122e+02 -7.073026664887026e+00 -2.473251401357733e+02 + 4 6.229989536803573e+02 -1.013959166672914e+02 -4.324625838797200e+02 4.368325513444434e+02 + ME 1.141960698900670e-04 + +Event 206 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.487698581700872e+02 -4.771827558939671e+02 -2.639484985605369e+02 6.145050708573942e+01 + 3 4.357856725513921e+02 1.877155863290790e+02 1.701172104948723e+02 3.545872893148350e+02 + 4 5.154444692785203e+02 2.894671695648880e+02 9.383128806566407e+01 -4.160377964005747e+02 + ME 4.181894856678196e-03 + +Event 207 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.289473514933904e+02 -3.230637718239222e+02 -3.258094337294262e+02 2.631792409740627e+02 + 3 3.730441408755687e+02 -1.145152671243400e+02 -7.298530142052728e+01 -3.474497523579300e+02 + 4 5.980085076310412e+02 4.375790389482623e+02 3.987947351499535e+02 8.427051138386733e+01 + ME 1.164313051910223e-04 + +Event 208 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.144460531270951e+02 3.105028133645123e+02 -3.495125011961061e+01 3.525242310830971e+01 + 3 7.230517599976930e+02 -6.554206809343710e+02 2.220922910679197e+02 2.095294558946057e+02 + 4 4.625021868752115e+02 3.449178675698587e+02 -1.871410409483092e+02 -2.447818790029154e+02 + ME 4.870805375832175e-04 + +Event 209 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.827014058170526e+02 -6.682954863774691e+01 -1.958656753088386e+02 -1.925890275057887e+02 + 3 5.969812148172334e+02 5.625717004655274e+02 1.060136244597390e+02 -1.692949027847389e+02 + 4 6.203173793657135e+02 -4.957421518277806e+02 8.985205084909933e+01 3.618839302905276e+02 + ME 1.006214003685978e-04 + +Event 210 
Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.369223392964550e+02 -2.366581006943837e+02 8.850719545688517e+01 -2.228813191927022e+02 + 3 6.926279093100446e+02 9.835546321295953e+01 -1.581805884470998e+02 6.671120783270954e+02 + 4 4.704497513935005e+02 1.383026374814241e+02 6.967339299021459e+01 -4.442307591343932e+02 + ME 5.978096663280861e-02 + +Event 211 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.754314663824422e+02 -1.965408456680789e+02 -5.399725108422632e+02 3.037689947684008e+01 + 3 6.656941886103589e+02 4.112771407945243e+02 5.114655840792436e+02 1.113679599883347e+02 + 4 2.588743450071987e+02 -2.147362951264454e+02 2.850692676301958e+01 -1.417448594651748e+02 + ME 4.390225704479540e-04 + +Event 212 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.922157374848575e+02 8.073316194509509e+00 4.947261155542873e+02 -3.254233732830556e+02 + 3 3.635572903001510e+02 8.951663862813328e+01 4.011175755255380e+01 3.500738802669425e+02 + 4 5.442269722149915e+02 -9.758995482264277e+01 -5.348378731068407e+02 -2.465050698388703e+01 + ME 3.034959524264807e-04 + +Event 213 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.434820262506831e+02 2.991548764052632e+02 2.111623598614187e+02 -6.470566753063677e+02 + 3 5.607612173038239e+02 -2.664197873565703e+02 -1.905271140771769e+02 4.551626726109782e+02 + 4 1.957567564454930e+02 -3.273508904869271e+01 -2.063524578424195e+01 
1.918940026953895e+02 + ME 1.831097412012331e-04 + +Event 214 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.400874280734794e+02 3.457358963402696e+02 2.445843697627679e+02 -3.351710101016578e+02 + 3 3.400793067879316e+02 1.482066942304564e+02 1.256466447865830e+02 2.791086371729012e+02 + 4 6.198332651385894e+02 -4.939425905707262e+02 -3.702310145493509e+02 5.606237292875652e+01 + ME 1.367090429786524e-04 + +Event 215 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.916345321859862e+02 3.271767110560380e+02 -1.945589530122144e+02 9.208594000107220e+01 + 3 6.136750729169615e+02 -1.269585669220027e+02 2.644680756040780e+02 -5.390132228350478e+02 + 4 4.946903948970534e+02 -2.002181441340350e+02 -6.990912259186327e+01 4.469272828339764e+02 + ME 6.216073146940013e-05 + +Event 216 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.767411090262153e+02 1.602503356822860e+02 2.758455349572532e+02 -2.004069210086422e+02 + 3 4.061922956351254e+02 3.340053729931860e+02 2.237650079776778e+02 5.798114391563541e+01 + 4 7.170665953386591e+02 -4.942557086754720e+02 -4.996105429349309e+02 1.424257770930067e+02 + ME 1.236322176421920e-03 + +Event 217 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.474118977458854e+02 -5.378641111590875e+02 -3.279650037002521e+02 1.492759847325320e+02 + 3 5.088298200539714e+02 3.261878344469131e+02 1.555821256186315e+02 -3.581947579501666e+02 + 4 
3.437582822001434e+02 2.116762767121744e+02 1.723828780816206e+02 2.089187732176346e+02 + ME 3.359763963640064e-05 + +Event 218 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.658501161076261e+02 -6.577627036244853e+02 -3.020200479570948e+01 9.895676706252428e+01 + 3 2.516345839620714e+02 1.565221509782131e+02 -1.156477271957936e+02 1.595192254662914e+02 + 4 5.825152999303024e+02 5.012405526462722e+02 1.458497319915031e+02 -2.584759925288157e+02 + ME 5.987124496807696e-04 + +Event 219 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.328556070633436e+02 6.122246558068493e+01 -1.687441385117925e+02 3.938796795879555e+02 + 3 6.500677455605623e+02 -3.703058656885360e+02 4.356876543064814e+02 -3.092537914719427e+02 + 4 4.170766473760947e+02 3.090834001078510e+02 -2.669435157946889e+02 -8.462588811601289e+01 + ME 2.799995024816229e-04 + +Event 220 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.686297280598668e+02 -3.497113779929074e+02 -8.765282776369959e+01 7.685577594963361e+01 + 3 4.155522773953193e+02 -1.777404948015451e+02 -1.525848366500188e+02 3.432344379292751e+02 + 4 7.158179945448152e+02 5.274518727944525e+02 2.402376644137180e+02 -4.200902138789081e+02 + ME 3.535808409277698e-03 + +Event 221 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.295220830718469e+02 3.654688468413811e+01 4.204675060608333e+02 3.197890523886257e+02 + 3 7.127556392876786e+02 
-1.727486268095863e+02 -4.342549693537606e+02 -5.381460163035254e+02 + 4 2.577222776404743e+02 1.362017421254481e+02 1.378746329292729e+01 2.183569639148998e+02 + ME 2.825415640707878e-05 + +Event 222 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.464305981122427e+02 -2.054199106396077e+02 6.127423271580307e+01 1.215572638876956e+02 + 3 6.926647117218595e+02 4.702892479611936e+02 3.872350261814336e+02 -3.296383785530530e+02 + 4 5.609046901658980e+02 -2.648693373215859e+02 -4.485092588972366e+02 2.080811146653574e+02 + ME 6.353331085740444e-05 + +Event 223 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.463384302181124e+02 -1.209251938955737e+02 -2.140981972257043e+02 -1.488897673935926e+01 + 3 6.819620845265061e+02 -2.400891875757810e+02 5.819023806457058e+02 2.623339210620683e+02 + 4 5.716994852553809e+02 3.610143814713547e+02 -3.678041834200016e+02 -2.474449443227090e+02 + ME 3.946653114295199e-04 + +Event 224 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.236851263016065e+02 -8.671871524968941e+01 1.717231909970331e+02 1.141317038679677e+02 + 3 5.308972974363860e+02 -3.715833295101987e+01 4.680039348616381e+02 2.478780257941054e+02 + 4 7.454175762620065e+02 1.238770482007101e+02 -6.397271258586711e+02 -3.620097296620725e+02 + ME 8.863362174401691e-02 + +Event 225 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.094176014319267e+02 1.569347096242780e+02 
-1.561291130928883e+00 -4.846394040251012e+02 + 3 7.252311334449814e+02 -3.845161955462209e+02 -4.374219820797173e+01 6.133466494377277e+02 + 4 2.653512651230915e+02 2.275814859219426e+02 4.530348933890066e+01 -1.287072454126262e+02 + ME 3.983470118521901e-04 + +Event 226 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.863217264048350e+02 -2.391756120967483e+02 -6.171186323675804e+02 1.816511279850092e+02 + 3 5.332348374442744e+02 1.096335504493486e+02 4.112484130583279e+02 -3.212391931833644e+02 + 4 2.804434361508906e+02 1.295420616473995e+02 2.058702193092524e+02 1.395880651983551e+02 + ME 3.799953795834203e-05 + +Event 227 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.243206345463227e+02 -5.280189925476209e+02 -1.406011303275692e+02 4.754657162080069e+02 + 3 5.487499634657127e+02 3.840442912861270e+02 -1.353123555187441e+01 -3.917312987222201e+02 + 4 2.269294019879643e+02 1.439747012614939e+02 1.541323658794436e+02 -8.373441748578678e+01 + ME 2.907212401937304e-04 + +Event 228 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.119578664379945e+02 1.625437651479949e+01 -1.806612394559917e+02 1.096514885776142e+02 + 3 6.254097456672617e+02 -3.200704000326812e+01 3.158243706171928e+02 5.388579277416935e+02 + 4 6.626323878947439e+02 1.575266348846865e+01 -1.351631311612011e+02 -6.485094163193077e+02 + ME 9.010319691049223e-01 + +Event 229 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.921227120343663e+02 -3.877491982207574e+02 4.449193714386763e+02 -4.802726626309341e+01 + 3 4.688278331283220e+02 3.470549659129083e+02 -1.517581364471262e+02 -2.762641051115459e+02 + 4 4.390494548373112e+02 4.069423230784908e+01 -2.931612349915501e+02 3.242913713746393e+02 + ME 3.495772451463616e-05 + +Event 230 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.261952284727869e+02 2.153699775439378e+02 -1.171086083390751e+02 3.486312082969336e+02 + 3 3.540619701921574e+02 3.070144260847320e+01 1.307424531367546e+02 3.276029778648148e+02 + 4 7.197428013350558e+02 -2.460714201524110e+02 -1.363384479767958e+01 -6.762341861617484e+02 + ME 3.222589174262472e-01 + +Event 231 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.205236024420394e+02 7.533931576750221e+01 -3.260217181731272e+02 -2.547036061581323e+02 + 3 5.397543491930861e+02 8.423195081267899e+01 -1.158376015978276e+02 5.204050211049135e+02 + 4 5.397220483648742e+02 -1.595712665801811e+02 4.418593197709548e+02 -2.657014149467810e+02 + ME 5.555178667842419e-04 + +Event 232 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.295782852421119e+02 3.239064445356881e+02 9.240815775655157e-01 2.821724019337123e+02 + 3 7.183371274312137e+02 -6.155391061575078e+02 -1.955291718271078e+02 -3.144649112405859e+02 + 4 3.520845873266733e+02 2.916326616218200e+02 1.946050902495421e+02 3.229250930687325e+01 + ME 6.753821520363789e-05 + +Event 233 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.640046126075326e+02 -2.220120664068516e+02 -1.165482463207536e+02 2.638683509799470e+02 + 3 4.682121509308885e+02 -1.009786196736113e+02 3.762431872847592e+02 2.597441061312977e+02 + 4 6.677832364615792e+02 3.229906860804629e+02 -2.596949409640056e+02 -5.236124571112447e+02 + ME 5.431717621567728e-03 + +Event 234 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 8.690043548936441e+01 -2.607433849884744e+01 -7.258333015587985e+01 4.004341073848801e+01 + 3 6.785651905172676e+02 -3.574930335951373e+02 -4.725723606052792e+01 5.748184081539155e+02 + 4 7.345343739933678e+02 3.835673720939847e+02 1.198405662164077e+02 -6.148618188924036e+02 + ME 1.996277353471906e-01 + +Event 235 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.000566282865330e+02 1.219146462304111e+01 -2.126850238006026e+02 2.113064812540423e+02 + 3 7.160981218147419e+02 2.575873756248089e+02 2.779062108697768e+02 -6.076293293985469e+02 + 4 4.838452498987245e+02 -2.697788402478499e+02 -6.522118706917435e+01 3.963228481445046e+02 + ME 3.958736816695858e-05 + +Event 236 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.510518772182422e+02 -9.497518588910036e+01 1.467158067736534e+01 1.165380984781943e+02 + 3 6.955499852411461e+02 5.933480346078575e+02 3.495450158124773e+02 9.770452249822526e+01 + 4 6.533981375406115e+02 -4.983728487187571e+02 -3.642165964898427e+02 -2.142426209764196e+02 + ME 1.127760274384129e-03 + +Event 237 Batch 0 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.173874152942701e+02 2.069918593916189e+02 -3.850229167793931e+01 -5.412237993169356e+01 + 3 7.305677895866183e+02 -6.701932224704495e+02 -2.421540700080861e+02 1.610333695687662e+02 + 4 5.520447951191119e+02 4.632013630788306e+02 2.806563616860255e+02 -1.069109896370727e+02 + ME 1.827772116807501e-04 + +Event 238 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.349573912113934e+02 -3.336495545457479e+02 -4.785400196851593e+02 2.506956580500141e+02 + 3 5.768887318987100e+02 4.812119270965609e+02 2.334547330568690e+02 -2.161818165921042e+02 + 4 2.881538768898969e+02 -1.475623725508128e+02 2.450852866282899e+02 -3.451384145790984e+01 + ME 9.825725038230817e-05 + +Event 239 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.349076725903782e+02 -5.331874414268931e+02 1.887721601290928e+01 -3.848403846142781e+01 + 3 3.658437465440001e+02 8.335465236419728e+01 1.670818061666300e+01 -3.558292926602242e+02 + 4 5.992485808656212e+02 4.498327890626960e+02 -3.558539662957237e+01 3.943133311216517e+02 + ME 9.237747195986976e-05 + +Event 240 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.870582387324442e+02 1.830793600232297e+02 -1.562409872742485e+02 1.564389154054251e+02 + 3 6.007192677438852e+02 3.433229388031108e+02 4.688113613010561e+02 -1.523446941819631e+02 + 4 6.122224935236704e+02 -5.264022988263405e+02 -3.125703740268074e+02 -4.094221223462029e+00 + ME 
1.432407961090023e-04 + +Event 241 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.424696267657398e+02 4.823783107714220e+02 2.498315161211406e+02 5.061190823507635e+02 + 3 2.455726236162736e+02 -1.827879695947951e+02 -1.199757723946156e+02 -1.118046764652876e+02 + 4 5.119577496179859e+02 -2.995903411766270e+02 -1.298557437265251e+02 -3.943144058854758e+02 + ME 2.711650879127002e-03 + +Event 242 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.249130370348900e+02 1.676828147928013e+02 6.059046362201675e+02 -3.609168279440811e+02 + 3 6.240672718074164e+02 -4.529413961306756e+01 -5.490982345027016e+02 2.930862151720546e+02 + 4 1.510196911576932e+02 -1.223886751797336e+02 -5.680640171746590e+01 6.783061277202636e+01 + ME 4.594938851627855e-05 + +Event 243 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.655090712555230e+02 2.096323612054770e+02 2.113490506800235e+02 3.578890153850057e+02 + 3 5.764797256412519e+02 6.697224883641853e+01 -5.382210340689440e+02 -1.953502251008744e+02 + 4 4.580112031032260e+02 -2.766046100418948e+02 3.268719833889207e+02 -1.625387902841315e+02 + ME 2.316824945814668e-04 + +Event 244 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.237109195354749e+02 1.305098338947756e+02 -4.868141165486322e+02 -1.423106687020528e+02 + 3 5.804450110242352e+02 -4.045654344879671e+02 2.643676733537771e+02 3.214855413949400e+02 + 4 3.958440694402901e+02 
2.740556005931916e+02 2.224464431948551e+02 -1.791748726928872e+02 + ME 2.650535388587173e-04 + +Event 245 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.629169357520612e+02 2.457511487795889e+02 -4.402365929491729e+01 -8.242333044139184e+01 + 3 6.931386101565748e+02 -5.195573187661655e+02 4.004017488088275e+02 -2.240084037645317e+02 + 4 5.439444540913644e+02 2.738061699865766e+02 -3.563780895139104e+02 3.064317342059234e+02 + ME 4.301255099855837e-05 + +Event 246 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.300937687157436e+02 -5.459948028041551e+02 3.085954426748103e+02 6.063567799240774e+01 + 3 1.673910408536142e+02 -3.546130270298914e+01 7.662824936562286e+01 -1.445350060290698e+02 + 4 7.025151904306418e+02 5.814561055071443e+02 -3.852236920404333e+02 8.389932803666211e+01 + ME 6.307414889819381e-04 + +Event 247 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.577847506495702e+02 2.418237207037818e+02 -8.449121421856779e+01 2.890502538162604e+01 + 3 5.130193185035739e+02 4.381905811488919e+02 1.366496386102691e+02 2.291390669832419e+02 + 4 7.291959308468561e+02 -6.800143018526737e+02 -5.215842439170128e+01 -2.580440923648678e+02 + ME 4.051679981732582e-03 + +Event 248 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.033207479153641e+02 -5.040306065309411e+02 -2.020637997366072e+02 4.469714117975367e+02 + 3 1.758360012551319e+02 -1.471306652922548e+01 
-4.035460943683618e+00 -1.751728862172264e+02 + 4 6.208432508295036e+02 5.187436730601667e+02 2.060992606802908e+02 -2.717985255803104e+02 + ME 5.612762085221702e-04 + +Event 249 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.018816177222694e+02 5.523075638651412e+01 1.752331212074551e+02 2.395316845419020e+02 + 3 6.597415560701298e+02 6.315352823685415e+01 -6.561001191322722e+02 -2.834054254405022e+01 + 4 5.383768262076012e+02 -1.183842846233684e+02 4.808669979248172e+02 -2.111911419978518e+02 + ME 4.887834528606045e-04 + +Event 250 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.166381935101299e+02 -1.289072913913530e+02 -1.189615590004072e+02 -1.271344351215278e+02 + 3 6.815426093761063e+02 -2.511966318704652e+02 5.323234433390908e+02 3.435583388650891e+02 + 4 6.018191971137629e+02 3.801039232618180e+02 -4.133618843386822e+02 -2.164239037435609e+02 + ME 3.481353570466454e-04 + +Event 251 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.676961532387151e+02 -3.991265595084289e+01 -4.419965947723095e+02 4.988628500443887e+02 + 3 7.150412702460949e+02 3.921851524844912e+01 5.505653759000155e+02 -4.545587894617490e+02 + 4 1.172625765151895e+02 6.941407023942203e-01 -1.085687811277061e+02 -4.430406058263954e+01 + ME 5.622065965300481e-04 + +Event 252 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.112668789066533e+02 -1.147554660376938e+02 3.364589711187054e+01 
-1.741632301749357e+02 + 3 7.393007599584276e+02 2.529046383258835e+02 -3.593132473314827e+02 5.945576909606565e+02 + 4 5.494323611349191e+02 -1.381491722881897e+02 3.256673502196121e+02 -4.203944607857206e+02 + ME 2.717321171975897e-03 + +Event 253 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.299659304470913e+01 -4.405884533650594e+01 -5.451291667290519e+01 2.038780663930336e+01 + 3 7.253475305576840e+02 3.245698054519170e+02 -1.402290280555607e+02 -6.333397991328418e+02 + 4 7.016558763976062e+02 -2.805109601154107e+02 1.947419447284657e+02 6.129519924935382e+02 + ME 6.575402497262674e-04 + +Event 254 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.982520535096858e+02 -6.164633378269740e+01 1.773450413210087e+02 -6.365801262063786e+01 + 3 7.183815394471146e+02 -1.984891252513598e+02 -6.893152145826988e+02 -3.896971029099804e+01 + 4 5.833664070431996e+02 2.601354590340572e+02 5.119701732616901e+02 1.026277229116358e+02 + ME 9.243527649905285e-05 + +Event 255 Batch 0 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.347080663542582e+02 -5.063606624096444e+02 1.592577719822621e+02 6.440929941880935e+01 + 3 2.475406015289463e+02 -1.856063881081878e+02 3.468010668896055e+00 -1.637516137347836e+02 + 4 7.177513321167951e+02 6.919670505178326e+02 -1.627257826511581e+02 9.934231431597432e+01 + ME 1.310224262659862e-03 + +Event 0 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
5.775677821222390e+02 4.314431287975208e+02 -2.652567205762378e+02 -2.776332864556196e+02 + 3 6.023469575940328e+02 -3.228069847179709e+02 5.005558924007595e+02 8.978477890465912e+01 + 4 3.200852602837276e+02 -1.086361440795499e+02 -2.352991718245217e+02 1.878485075509604e+02 + ME 2.849120273644417e-05 + +Event 1 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.241206267812559e+02 3.541578305635416e+02 -4.894807402105654e+02 3.991635230623179e+02 + 3 7.375567605136828e+02 -3.903081173548693e+02 4.920451519627784e+02 -3.867054653560790e+02 + 4 3.832261270506110e+01 3.615028679132772e+01 -2.564411752212871e+00 -1.245805770623896e+01 + ME 1.004119822956703e-03 + +Event 2 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.849204091734790e+02 2.108660079931152e+02 4.054727376659824e+02 1.620962335024329e+02 + 3 2.728468517759738e+02 4.961449545460115e+01 2.005017763154939e+02 1.782774356422519e+02 + 4 7.422327390505470e+02 -2.604805034477164e+02 -6.059745139814763e+02 -3.403736691446848e+02 + ME 2.775388769533376e-02 + +Event 3 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.264155576764489e+02 -4.170952165204416e+02 -7.054834331799707e+01 5.370977042744418e+01 + 3 7.108631972082329e+02 6.832597695609467e+02 -1.727180704166534e+02 -9.301097030017993e+01 + 4 3.627212451153183e+02 -2.661645530405051e+02 2.432664137346504e+02 3.930119987273574e+01 + ME 5.477432488125450e-05 + +Event 4 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 -7.500000000000000e+02 + 2 7.183269968238451e+02 -3.584978055671313e+02 -5.048824553914337e+02 -3.640971079361011e+02 + 3 7.387431276480258e+02 4.013538934928405e+02 5.036810263913360e+02 3.618865629982621e+02 + 4 4.292987552812848e+01 -4.285608792570925e+01 1.201429000097645e+00 2.210544937839321e+00 + ME 3.148008415499230e-04 + +Event 5 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.529780005473894e+02 -8.443182436392422e+01 4.445408460134586e+02 -2.106590230986441e+01 + 3 4.683757780543922e+02 -6.076819021151036e+01 -1.335482427838442e+02 -4.448010379662152e+02 + 4 5.786462213982178e+02 1.452000145754346e+02 -3.109926032296145e+02 4.658669402760799e+02 + ME 8.492247748939415e-05 + +Event 6 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.238848262005391e+02 -1.065131260140052e+02 -4.741487807795933e+02 -3.912418229627632e+02 + 3 1.729069432107233e+02 -1.460869767542721e+02 -8.199113358821990e+01 4.281191710484079e+01 + 4 7.032082305887382e+02 2.526001027682773e+02 5.561399143678132e+02 3.484299058579224e+02 + ME 4.894308689384216e-04 + +Event 7 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.977203086376782e+02 -6.126072843634399e+02 -1.744636661244187e+02 2.847602033865263e+02 + 3 1.614193396272252e+02 -4.571584237043671e+00 8.497734613495713e+01 -1.371646983269120e+02 + 4 6.408603517350969e+02 6.171788686004837e+02 8.948631998946141e+01 -1.475955050596143e+02 + ME 3.553502935187301e-04 + +Event 8 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 
+ 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.871091945484288e+02 4.059708628308462e+02 2.886614153103367e+02 4.732666173272760e+02 + 3 5.653302025665632e+02 -2.838835484844413e+02 -7.353399035097290e+01 -4.833229987253827e+02 + 4 2.475606028850082e+02 -1.220873143464048e+02 -2.151274249593636e+02 1.005638139810630e+01 + ME 8.795665982966868e-05 + +Event 9 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.618579955503452e+02 1.385215220188489e+01 1.601201234527701e+02 -1.917484467788566e+01 + 3 7.196660585644589e+02 -4.527189715496824e+02 -4.214090439733052e+02 3.679391067910630e+02 + 4 6.184759458851961e+02 4.388668193477976e+02 2.612889205205350e+02 -3.487642621131773e+02 + ME 1.059464882196109e-03 + +Event 10 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.832785200561162e+01 1.027681340851886e+01 -7.242726264265977e+01 -2.799877018853974e+01 + 3 7.448007230566494e+02 2.520540107528716e+02 6.813719334665398e+02 1.641011304445167e+02 + 4 6.768714249377393e+02 -2.623308241613905e+02 -6.089446708238800e+02 -1.361023602559769e+02 + ME 5.920603531712434e-04 + +Event 11 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.478627446486677e+02 2.070882322301630e+02 -4.708081692757452e+02 1.887000762823861e+02 + 3 6.997827604382593e+02 -4.209013422316021e+02 4.569873120768409e+02 -3.220257264800591e+02 + 4 2.523544949130733e+02 2.138131100014392e+02 1.382085719890431e+01 1.333256501976729e+02 + ME 2.708351100630072e-05 + +Event 12 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.802868936311938e+02 -4.467002255894126e+01 5.211262762381961e+02 -2.513262266832405e+02 + 3 5.208038834706859e+02 2.151797013176277e+01 -4.993650129388666e+02 -1.463155694111945e+02 + 4 3.989092228981198e+02 2.315205242717859e+01 -2.176126329932955e+01 3.976417960944350e+02 + ME 5.063551553388695e-04 + +Event 13 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.774880087360025e+02 1.576445054854711e+02 5.481077151088401e+02 -9.065617884226722e+01 + 3 5.915098138161557e+02 -3.018001633277128e+02 -3.808656371901899e+02 3.372564123391870e+02 + 4 3.310021774478421e+02 1.441556578422419e+02 -1.672420779186502e+02 -2.466002334969198e+02 + ME 1.508559154600574e-03 + +Event 14 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.531797527967491e+02 -8.400833666640551e+01 -2.384535242035554e+02 -1.350938161690894e+01 + 3 5.261064571264828e+02 -1.751971590790252e+02 -3.334570051994592e+02 3.672878780523887e+02 + 4 7.207137900767681e+02 2.592054957454308e+02 5.719105294030146e+02 -3.537784964354798e+02 + ME 3.414896206510660e-03 + +Event 15 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.605848765362424e+02 3.563504404614685e+02 1.735853700506503e+02 2.345653669687875e+02 + 3 4.216445088607454e+02 1.370719005416187e+02 -3.933730877164850e+02 6.521502736890034e+01 + 4 6.177706146030118e+02 -4.934223410030871e+02 2.197877176658347e+02 -2.997803943376878e+02 + ME 
4.641038289711456e-04 + +Event 16 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.972484926572780e+02 -1.474122335888776e+02 -4.748950276275916e+02 -6.399787981958489e-01 + 3 5.072511849723049e+02 4.846784046822066e+02 1.224000792205880e+02 -8.607455661990269e+01 + 4 4.955003223704172e+02 -3.372661710933287e+02 3.524949484070037e+02 8.671453541809866e+01 + ME 5.868696654779037e-05 + +Event 17 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.182636773520260e+02 -9.176062613973060e+01 -1.890905041641619e+02 2.389906630959087e+02 + 3 6.376303990615819e+02 -4.240378519397394e+02 2.706855745366566e+02 -3.917827786765570e+02 + 4 5.441059235863918e+02 5.157984780794702e+02 -8.159507037249485e+01 1.527921155806483e+02 + ME 7.479772407842035e-05 + +Event 18 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.532560008158404e+02 -4.148613005881325e+02 1.689647846464810e+02 -3.247047971041213e+02 + 3 3.650144721835348e+02 -1.597348634907620e+02 -2.160675866909895e+02 2.470529017650751e+02 + 4 5.817295270006247e+02 5.745961640788945e+02 4.710280204450830e+01 7.765189533904639e+01 + ME 9.166733252039524e-05 + +Event 19 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.263687475619530e+02 -1.904667433734991e+02 2.390747946355329e+02 -1.143775398573919e+02 + 3 7.331345945903580e+02 2.597391859223820e+02 -6.739404183465076e+02 1.258022320965774e+02 + 4 4.404966578476883e+02 
-6.927244254888296e+01 4.348656237109746e+02 -1.142469223918529e+01 + ME 8.808463517676338e-05 + +Event 20 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 9.588718605412235e+01 4.259536217794532e+01 8.056474827260675e+01 -2.982128277051556e+01 + 3 7.250265356668370e+02 3.120913743414048e+02 -4.446787057645158e+02 4.801284204484704e+02 + 4 6.790862782790413e+02 -3.546867365193502e+02 3.641139574919092e+02 -4.503071376779549e+02 + ME 3.714661362382907e-03 + +Event 21 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.825278201605080e+02 -1.533737674675501e+02 8.574830442242747e+01 4.939757963742075e+01 + 3 7.183016103669911e+02 1.713205736990390e+02 -6.275703015775030e+02 -3.045685162014730e+02 + 4 5.991705694725006e+02 -1.794680623148891e+01 5.418219971550753e+02 2.551709365640523e+02 + ME 7.522823967260550e-05 + +Event 22 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.349542451120770e+02 9.235159917618290e+01 -2.156570331301489e+02 -1.291214495308476e+01 + 3 7.360601907662837e+02 -2.182033070539752e+02 6.568866822530020e+02 -2.503433799808774e+02 + 4 5.289855641216395e+02 1.258517078777923e+02 -4.412296491228531e+02 2.632555249339621e+02 + ME 3.912294460841818e-05 + +Event 23 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.350908908124364e+02 -7.377772511691033e+00 -2.298431804723787e+02 -4.884063683135331e+01 + 3 6.797114625392685e+02 -5.485955088721075e+02 
3.603976926464840e+02 1.765336882516069e+02 + 4 5.851976466482949e+02 5.559732813837986e+02 -1.305545121741054e+02 -1.276930514202538e+02 + ME 2.065013893373332e-04 + +Event 24 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.355364173804401e+02 2.538053291625626e+02 -2.665393838801487e+02 -2.328767540869265e+02 + 3 4.093863144993796e+02 -1.953012891316529e+02 -3.573484670764558e+02 4.191221827828568e+01 + 4 6.550772681201798e+02 -5.850404003090968e+01 6.238878509566048e+02 1.909645358086408e+02 + ME 1.900003349338864e-04 + +Event 25 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.365386968907906e+02 3.875876454009266e+02 3.151568854896984e+02 5.412404333367774e+02 + 3 5.208510884285564e+02 -2.430585576296288e+02 -1.518636440371933e+02 -4.349089876054082e+02 + 4 2.426102146806531e+02 -1.445290877712977e+02 -1.632932414525050e+02 -1.063314457313692e+02 + ME 3.722647011248457e-04 + +Event 26 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.198867014174700e+02 5.189601929589824e+02 4.797253921416957e+02 -1.370428003807496e+02 + 3 3.889101953712927e+02 -1.847394503243419e+02 -2.837815501141774e+02 1.912864537085460e+02 + 4 3.912031032112369e+02 -3.342207426346404e+02 -1.959438420275182e+02 -5.424365332779645e+01 + ME 1.226552228047559e-04 + +Event 27 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.732032222628646e+02 5.870808395006010e+02 -9.126179303429218e+01 
3.165595544104447e+02 + 3 1.177373967283342e+02 7.847176641415683e+01 5.304379211899001e+00 -8.761358356661104e+01 + 4 7.090593810088013e+02 -6.655526059147578e+02 8.595741382239318e+01 -2.289459708438336e+02 + ME 1.617480241086442e-03 + +Event 28 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.475300414228806e+02 3.136396845517189e+02 3.816259196370642e+02 -4.186728559156668e+02 + 3 7.290923529036073e+02 -2.791764769994178e+02 -4.112865540505715e+02 5.333662195995522e+02 + 4 1.233776056735125e+02 -3.446320755230100e+01 2.966063441350738e+01 -1.146933636838856e+02 + ME 5.041658048553457e-02 + +Event 29 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.156754590345619e+02 -2.870540678871016e+02 4.159516713841875e+01 -1.245825012466667e+02 + 3 4.770060274033895e+02 -2.355061130652809e+02 -3.231858413754910e+02 -2.600433287405434e+02 + 4 7.073185135620483e+02 5.225601809523826e+02 2.815906742370723e+02 3.846258299872100e+02 + ME 7.970413191600231e-04 + +Event 30 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.091290614221006e+02 1.543004089904795e+02 4.216196287493841e+00 -5.892468251447810e+02 + 3 2.079357839022732e+02 2.034647466922836e+02 4.185675980476621e+01 9.348729279626955e+00 + 4 6.829351546756271e+02 -3.577651556827630e+02 -4.607295609226001e+01 5.798980958651542e+02 + ME 3.913768339081636e-04 + +Event 31 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
6.901710072855794e+02 1.433309098684658e+01 6.447948515477651e+02 -2.457034416076623e+02 + 3 5.898919363861645e+02 1.120085307876391e+02 -4.815950471622465e+02 3.217029626736536e+02 + 4 2.199370563282564e+02 -1.263416217744857e+02 -1.631998043855182e+02 -7.599952106599136e+01 + ME 2.420139954470375e-04 + +Event 32 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.144498311923273e+02 5.832947925341469e+02 -1.925283703230111e+02 1.576726595169116e+01 + 3 2.478450424037005e+02 5.004284035329789e+01 2.389954177960992e+02 4.247433867565734e+01 + 4 6.377051264039726e+02 -6.333376328874450e+02 -4.646704747308824e+01 -5.824160462734866e+01 + ME 2.172285461632181e-04 + +Event 33 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.134536717469738e+02 -1.625429495269566e+02 -1.853973484494194e+02 5.617232593785357e+02 + 3 5.361644687950270e+02 -3.755831293394987e+01 -9.992652347025610e+01 -5.254297294928765e+02 + 4 3.503818594579993e+02 2.001012624609065e+02 2.853238719196755e+02 -3.629352988565912e+01 + ME 1.227400448166599e-04 + +Event 34 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.840838099420727e+02 -2.442269925519277e+02 -3.827314394217586e+01 -2.939535943332559e+02 + 3 6.022630974514658e+02 3.956891925431131e+01 5.086724982658300e+02 3.200116071158651e+02 + 4 5.136530926064611e+02 2.046580732976164e+02 -4.703993543236541e+02 -2.605801278260915e+01 + ME 9.632734626544979e-05 + +Event 35 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.454350783663418e+02 -3.439607925797615e+02 2.363778141880091e+01 -2.139209721976717e+01 + 3 6.705698302143293e+02 5.215327591153250e+02 4.060443141865526e+02 -1.131171661597076e+02 + 4 4.839950914193289e+02 -1.775719665355635e+02 -4.296820956053536e+02 1.345092633794747e+02 + ME 4.873234099836218e-05 + +Event 36 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.098652154429358e+02 2.489290984574327e+02 -1.674080692141068e+02 -6.433641786725617e+02 + 3 6.178479130357197e+02 -1.435715807033598e+02 2.588953561477193e+02 5.423065917191846e+02 + 4 1.722868715213448e+02 -1.053575177540730e+02 -9.148728693361247e+01 1.010575869533772e+02 + ME 6.682364323837818e-05 + +Event 37 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.906872786346029e+02 1.495946561071237e+02 1.712833879510068e+02 6.521750966909805e+02 + 3 3.682276595245591e+02 -1.358558710218083e+02 1.194309698061993e+02 -3.207351477449753e+02 + 4 4.410850618408379e+02 -1.373878508531533e+01 -2.907143577572061e+02 -3.314399489460051e+02 + ME 2.021704843794783e-03 + +Event 38 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.131720166645958e+02 -5.222102655174089e+02 6.340623138461885e+00 3.213038392347352e+02 + 3 4.540063357567761e+02 2.932429176443923e+02 -3.207297067242505e+02 -1.313879727496970e+02 + 4 4.328216475786279e+02 2.289673478730168e+02 3.143890835857887e+02 -1.899158664850381e+02 + ME 2.595762240290846e-04 + +Event 39 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.929747896182307e+02 2.510117592312212e+02 -1.378648144805472e+02 6.181113983529410e+01 + 3 6.287164314722788e+02 3.864928360026034e+01 6.254120614625330e+02 5.148142827864524e+01 + 4 5.783087789094906e+02 -2.896610428314809e+02 -4.875472469819858e+02 -1.132925681139391e+02 + ME 1.715638500913441e-04 + +Event 40 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.143487538112954e+02 -3.203572478439017e+01 1.022340126870988e+02 3.996944439980560e+01 + 3 7.361483923235807e+02 5.924235295921244e+02 -3.838567751530157e+02 -2.088128187524163e+02 + 4 6.495028538651248e+02 -5.603878048077345e+02 2.816227624659169e+02 1.688433743526105e+02 + ME 2.046625254997696e-04 + +Event 41 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.384898508133351e+02 5.540399192408263e+02 -3.014826159773289e+02 -9.908223727147148e+01 + 3 3.510407251698805e+02 -1.719168197014114e+02 2.065966849440144e+02 -2.258140996521069e+02 + 4 5.104694240167846e+02 -3.821230995394149e+02 9.488593103331456e+01 3.248963369235784e+02 + ME 4.461154075267706e-05 + +Event 42 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.291654598309212e+02 -1.090829060981258e+02 2.972891943885482e+02 -8.983292515941632e+01 + 3 6.884965239796815e+02 4.933628807557017e+02 -2.919492821202986e+02 3.812953554581829e+02 + 4 4.823380161893969e+02 -3.842799746575758e+02 -5.339912268249619e+00 -2.914624302987665e+02 + ME 6.706840264394344e-04 + +Event 43 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.674173006007982e+02 2.791827424102563e+02 1.079644067383057e+02 2.130637369397045e+02 + 3 7.392205647816576e+02 -6.110484627794917e+02 -4.247874240022369e+01 -4.138385868609020e+02 + 4 3.933621346175443e+02 3.318657203692355e+02 -6.548566433808197e+01 2.007748499211975e+02 + ME 2.748297196116919e-05 + +Event 44 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.081359682230012e+02 -1.082501549908087e+02 1.771964605001424e+02 1.427934167997762e+01 + 3 7.449563315308092e+02 5.092828751965591e+02 -5.388739609944279e+02 7.215083562608926e+01 + 4 5.469077002461893e+02 -4.010327202057504e+02 3.616775004942854e+02 -8.643017730606689e+01 + ME 1.767246359045818e-04 + +Event 45 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.180982465404428e+02 4.470261481799606e+02 -3.368837017252438e+01 -2.597277606009550e+02 + 3 3.377595659674064e+02 -7.316527185649471e+01 2.454727770679006e+02 -2.201624016839131e+02 + 4 6.441421874921517e+02 -3.738608763234668e+02 -2.117844068953764e+02 4.798901622848686e+02 + ME 1.648043339019128e-04 + +Event 46 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.296560291524888e+02 2.172411497655985e+02 5.821614514430422e+02 -1.017892054705761e+02 + 3 6.224001894826197e+02 1.405102091633609e+01 -6.218608257778047e+02 2.176414579432110e+01 + 4 2.479437813648912e+02 -2.312921706819346e+02 3.969937433476264e+01 
8.002505967625511e+01 + ME 4.046505566990672e-05 + +Event 47 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.458843469271558e+02 -1.019033861791133e+02 -1.559739004096152e+02 5.131058004898495e+02 + 3 2.573134207008559e+02 6.791700498899543e+01 -2.412204887508016e+02 5.839651284901167e+01 + 4 6.968022323719882e+02 3.398638119011773e+01 3.971943891604168e+02 -5.715023133388612e+02 + ME 1.425210403448905e-02 + +Event 48 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.623920218006384e+02 -6.284562032939594e+02 -1.837527125398962e+02 -1.002044496053409e+02 + 3 1.251779629744606e+02 -7.502448682133647e+01 9.550779386908961e+01 3.031682869117444e+01 + 4 7.124300152249010e+02 7.034806901152959e+02 8.824491867080656e+01 6.988762091416655e+01 + ME 8.774865298987497e-04 + +Event 49 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.397494808364366e+02 2.393958238941667e+02 -4.144666783354253e+00 -1.233996761053011e+01 + 3 6.782491241100330e+02 -3.516321535544010e+02 -2.705899831712921e+02 5.129890485673948e+02 + 4 5.820013950535310e+02 1.122363296602345e+02 2.747346499546463e+02 -5.006490809568649e+02 + ME 9.075456735544994e-03 + +Event 50 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.764898792162554e+02 4.667163214316568e+02 5.900817880915086e+01 -7.573978570375913e+01 + 3 5.114228101321805e+02 -2.035689445851523e+02 -4.549677995197112e+02 -1.145306811477843e+02 + 4 
5.120873106515638e+02 -2.631473768465044e+02 3.959596207105603e+02 1.902704668515434e+02 + ME 5.165613327883621e-05 + +Event 51 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.678795643859633e+02 4.629737719234087e+02 5.365495313512260e+01 4.108186077915564e+01 + 3 6.311645871918953e+02 -4.500610707732840e+02 -4.345770688214701e+02 8.340587481742409e+01 + 4 4.009558484221419e+02 -1.291270115012473e+01 3.809221156863476e+02 -1.244877355965798e+02 + ME 1.521494458129965e-04 + +Event 52 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.696230029266819e+02 2.516704934433110e+02 2.514038675722595e+02 1.003953305301003e+02 + 3 6.696174214325738e+02 -2.754912388418390e+01 -6.493999246431116e+02 -1.609604756850079e+02 + 4 4.607595756407442e+02 -2.241213695591271e+02 3.979960570708520e+02 6.056514515490755e+01 + ME 5.744039853307679e-05 + +Event 53 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.284624742442375e+01 -4.271742504396477e+01 -2.683807109937144e+01 -5.255012179908527e+01 + 3 7.493542950735830e+02 3.356513586119742e+02 2.501807367708783e+02 6.215139772812375e+02 + 4 6.777994575019937e+02 -2.929339335680092e+02 -2.233426656715070e+02 -5.689638554821523e+02 + ME 1.620672533701635e-02 + +Event 54 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.460259847230066e+02 2.055186857047574e+01 6.233229443227744e+02 4.093908861479222e+02 + 3 5.756222844616436e+02 2.606063779094543e+01 
-4.696411468594732e+02 -3.318117699890848e+02 + 4 1.783517308153498e+02 -4.661250636142105e+01 -1.536817974633011e+02 -7.757911615883737e+01 + ME 4.378281789622456e-04 + +Event 55 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.967428482894215e+02 -8.165820254184375e+01 5.098287527914878e+02 -2.991798919868828e+02 + 3 5.942526243827265e+02 5.606061544962814e+01 -2.905196430116550e+02 5.153559216750567e+02 + 4 3.090045273278509e+02 2.559758709221549e+01 -2.193091097798325e+02 -2.161760296881746e+02 + ME 1.780685053301963e-03 + +Event 56 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.610874267302015e+02 -4.199055433713192e+02 3.580252469767042e+02 1.015694718309908e+02 + 3 6.303091265298390e+02 2.130872195586830e+02 -5.453843477211296e+02 -2.333224059286980e+02 + 4 3.086034467399593e+02 2.068183238126362e+02 1.873591007444254e+02 1.317529340977073e+02 + ME 3.263121793615722e-05 + +Event 57 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.552053965855976e+02 4.516249927537604e+02 7.110694105335413e+00 4.746350341729918e+02 + 3 6.035190443408457e+02 -3.717228873476764e+02 2.148772607224587e+02 -4.241286299324849e+02 + 4 2.412755590735561e+02 -7.990210540608395e+01 -2.219879548277939e+02 -5.050640424050682e+01 + ME 1.625819541426412e-04 + +Event 58 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.959982971085279e+02 1.850007048157144e+02 -2.304987961744356e+02 
1.612563397119954e+01 + 3 7.018897389129393e+02 -3.764226030262937e+02 4.376344751014919e+02 3.992884868423145e+02 + 4 5.021119639785323e+02 1.914218982105791e+02 -2.071356789270569e+02 -4.154141208135140e+02 + ME 4.570722971945301e-03 + +Event 59 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.521089721327344e+02 1.223876815062618e+02 -3.629066091228881e+01 -5.371485459866159e+02 + 3 4.098988410471213e+02 -5.841964900319320e+01 -3.626461945087766e+02 1.819119075553315e+02 + 4 5.379921868201441e+02 -6.396803250306868e+01 3.989368554210654e+02 3.552366384312844e+02 + ME 5.158908460572081e-05 + +Event 60 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.143828168925966e+02 -4.584044193456334e+02 -2.419772079280939e+02 -4.915844060170315e+02 + 3 1.284110307517518e+02 8.324300347118131e+01 -7.889851197070544e+01 5.774963203893761e+01 + 4 6.572061523556517e+02 3.751614158744521e+02 3.208757198987993e+02 4.338347739780939e+02 + ME 1.689245141121903e-04 + +Event 61 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.394390210968652e+02 -2.137451655543887e+02 -3.779414621253705e+02 -6.767502250635194e+01 + 3 4.431311911324731e+02 3.845666395406355e+02 -2.150363068358314e+02 4.725610065709544e+01 + 4 6.174297877706620e+02 -1.708214739862470e+02 5.929777689612019e+02 2.041892184925614e+01 + ME 1.373376776280719e-04 + +Event 62 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
7.301725729481176e+02 4.281927891852710e+02 5.652737593150771e+02 -1.739784429324868e+02 + 3 7.567373964415995e+01 2.589885732647598e+01 -5.696550981957816e+01 4.255225906941358e+01 + 4 6.941536874077224e+02 -4.540916465117469e+02 -5.083082494954988e+02 1.314261838630732e+02 + ME 8.604883786730201e-04 + +Event 63 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.361152320236989e+02 -3.738769057978322e+02 1.427754799584549e+02 -1.732850750548248e+02 + 3 5.817148313055658e+02 5.081993893256958e+02 2.829214478037172e+02 -8.998890070513916e+00 + 4 4.821699366707354e+02 -1.343224835278637e+02 -4.256969277621723e+02 1.822839651253388e+02 + ME 4.551568062355954e-05 + +Event 64 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.097675704107204e+02 3.288514690970509e+02 4.971291587853200e+02 -1.285916042465611e+02 + 3 5.709532610348123e+02 -6.501292612520261e+01 -4.768258747557200e+02 3.072426254385416e+02 + 4 3.192791685544673e+02 -2.638385429718483e+02 -2.030328402960006e+01 -1.786510211919805e+02 + ME 4.608170816774565e-04 + +Event 65 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.258641293880481e+02 3.743515439843765e+02 -1.622018320411498e+02 -4.746128903155365e+02 + 3 7.438702198751357e+02 -4.029113627030088e+02 2.325939036896868e+02 5.804355380128616e+02 + 4 1.302656507368159e+02 2.855981871863234e+01 -7.039207164853697e+01 -1.058226476973251e+02 + ME 6.433642500589439e-03 + +Event 66 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.731957242404369e+02 1.596860493342637e+01 -3.714568973276624e+02 3.224632809376675e+01 + 3 6.079923612940432e+02 4.451199598539357e+02 3.189341902600864e+02 -2.642043054431177e+02 + 4 5.188119144655197e+02 -4.610885647873621e+02 5.252270706757586e+01 2.319579773493509e+02 + ME 4.690911012443659e-05 + +Event 67 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.084256499213536e+02 6.318790977834965e+02 -2.229764540025608e+02 2.299504472951746e+02 + 3 5.168612394424736e+01 1.130069959366449e+01 -1.428140623590626e+01 4.837138651102396e+01 + 4 7.398882261343986e+02 -6.431797973771611e+02 2.372578602384670e+02 -2.783218338061984e+02 + ME 5.940179765817433e-02 + +Event 68 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.644037677826096e+02 -7.446914007305443e+01 3.170710956176409e+02 4.609467220707991e+02 + 3 4.303832728799333e+02 -1.588265612792408e+02 -3.994808673830752e+02 -2.046757440246668e+01 + 4 5.052129593374569e+02 2.332957013522950e+02 8.240977176543437e+01 -4.404791476683325e+02 + ME 8.132470292407644e-03 + +Event 69 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.379282923937936e+02 -4.413455715133101e+01 1.058497776082811e+02 -2.084654354245804e+02 + 3 5.822935131976620e+02 -5.806422676829346e+02 4.095409019445289e+01 -1.559022092337181e+01 + 4 6.797781944085447e+02 6.247768248342657e+02 -1.468038678027338e+02 2.240556563479523e+02 + ME 3.046815523507368e-04 + +Event 70 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.861861307467999e+02 1.831219916849830e+02 2.904683423406074e+02 -4.750880530376755e+02 + 3 4.633200606614190e+02 -4.245314712871158e+02 -1.339518705596282e+02 1.284344380284136e+02 + 4 4.504938085917810e+02 2.414094796021329e+02 -1.565164717809791e+02 3.466536150092621e+02 + ME 3.535609864333284e-05 + +Event 71 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.383412459951699e+02 5.748049255568963e+02 -1.639684737984460e+02 -4.334298474879633e+02 + 3 3.973981306646684e+02 -3.228684354469153e+02 -4.837114091238284e+00 2.316416412804533e+02 + 4 3.642606233401616e+02 -2.519364901099809e+02 1.688055878896842e+02 2.017882062075102e+02 + ME 3.115226752866884e-05 + +Event 72 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.538199915090661e+02 3.512029503136998e+02 -6.467835580753928e+00 -4.246458742680748e+01 + 3 5.344234504985294e+02 1.310173344785610e+01 3.836805260246263e+01 5.328833470497181e+02 + 4 6.117565579924037e+02 -3.643046837615557e+02 -3.190021702170875e+01 -4.904187596229106e+02 + ME 9.317688030268624e-03 + +Event 73 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.694927197571709e+02 1.451947293992221e+02 -1.807863847612341e+02 4.082379055705570e+02 + 3 5.537325951281177e+02 -5.796379956652486e+01 5.401382741253894e+02 -1.072876026015002e+02 + 4 4.767746851147115e+02 -8.723092983269748e+01 -3.593518893641554e+02 -3.009503029690568e+02 + ME 1.080232956067680e-03 + +Event 74 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.258444305735200e+02 -3.349227552763228e+02 4.941036656040853e+02 1.880679848209579e+02 + 3 5.555040664889823e+02 3.765538795180095e+01 -5.474422011270133e+02 -8.645158222500019e+01 + 4 3.186515029374983e+02 2.972673673245214e+02 5.333853552292793e+01 -1.016164025959579e+02 + ME 1.626411496274593e-04 + +Event 75 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.943316317993887e+02 5.588489849751633e+01 -2.552251009651267e+02 -2.953548066221912e+02 + 3 5.467466262348044e+02 -3.021648543602058e+02 -2.377479281839000e+02 3.887212326756534e+02 + 4 5.589217419658070e+02 2.462799558626894e+02 4.929730291490267e+02 -9.336642605346221e+01 + ME 1.354571135726582e-04 + +Event 76 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.517772830004059e+02 2.282681125856672e+02 -4.885490190451381e+02 -1.169260227747471e+02 + 3 4.245403880864563e+02 -2.793100283061228e+02 1.521744876196477e+02 -2.811821020654221e+02 + 4 5.236823289131380e+02 5.104191572045557e+01 3.363745314254903e+02 3.981081248401691e+02 + ME 5.080373330052841e-05 + +Event 77 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.781543446472004e+02 -5.926925448310482e+01 -1.775497893613220e+02 3.285786605157444e+02 + 3 6.702964816234125e+02 -6.066564226432875e+01 -1.057468051743550e+02 -6.591165802199179e+02 + 4 4.515491737293868e+02 1.199348967474336e+02 2.832965945356771e+02 
3.305379197041733e+02 + ME 6.322416994759335e-05 + +Event 78 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.564262045363142e+02 1.882572856930395e+02 1.751822011208170e+02 -3.770878823051469e+02 + 3 3.809544602625753e+02 -2.816334489555118e+02 1.992812047321845e+02 -1.615422627793184e+02 + 4 6.626193352011105e+02 9.337616326247232e+01 -3.744634058530015e+02 5.386301450844653e+02 + ME 2.577470802061282e-04 + +Event 79 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.126536521478916e+02 6.075062399138448e+02 -4.178945028651391e+01 6.733726903166686e+01 + 3 2.872846052831653e+02 -1.084163947926164e+02 2.139961846825775e+01 2.651799127051088e+02 + 4 6.000617425689423e+02 -4.990898451212284e+02 2.038983181825617e+01 -3.325171817367755e+02 + ME 1.997723789786618e-03 + +Event 80 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.171281258707706e+02 -2.756641813219368e+02 1.445082905894676e+01 3.127240094205691e+02 + 3 3.805235327384963e+02 -2.955852199231463e+02 2.395269588958385e+02 7.373784162959280e+00 + 4 7.023483413907346e+02 5.712494012450838e+02 -2.539777879547847e+02 -3.200977935835285e+02 + ME 1.312074770692676e-03 + +Event 81 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.471091333863934e+02 -9.753029041192968e+01 7.407154559164039e+02 -7.162458282064984e-01 + 3 6.775352561453885e+02 9.550863422814814e+01 -6.702673865908516e+02 -2.595678293896890e+01 + 4 
7.535561046821789e+01 2.021656183781575e+00 -7.044806932555213e+01 2.667302876717549e+01 + ME 1.023343635320189e-04 + +Event 82 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.309094465924172e+02 3.042233433179615e+02 2.799835808203349e+02 -1.214096495919827e+02 + 3 5.540384887187944e+02 -4.824447657759212e+02 1.988969596446624e+02 1.861335391629671e+02 + 4 5.150520646887883e+02 1.782214224579596e+02 -4.788805404649973e+02 -6.472388957098455e+01 + ME 1.056627788959199e-04 + +Event 83 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.869534474909296e+02 -4.727010820510886e+02 1.062322962656183e+02 4.890855018466119e+01 + 3 3.520990385354405e+02 -1.437544586613779e+02 -3.142298368411061e+02 6.758696761482641e+01 + 4 6.609475139736300e+02 6.164555407124666e+02 2.079975405754879e+02 -1.164955177994876e+02 + ME 3.016126983476234e-04 + +Event 84 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.391975815431583e+01 -3.682657486111166e-01 -1.138840508663313e+01 -7.995516055627094e+00 + 3 7.493632094786752e+02 -3.452281541586203e+01 3.833012084573050e+02 6.429880080772213e+02 + 4 7.367170323670086e+02 3.489108116447314e+01 -3.719128033706719e+02 -6.349924920215941e+02 + ME 3.858327601029765e-01 + +Event 85 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.362448947738020e+02 6.409220704967113e+02 3.243429451315054e+02 1.614840505254833e+02 + 3 1.517836214454495e+02 -1.266859291808411e+02 
-6.780846852200752e+01 4.889738933094901e+01 + 4 6.119714837807480e+02 -5.142361413158706e+02 -2.565344766094980e+02 -2.103814398564324e+02 + ME 5.726149242693629e-04 + +Event 86 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.451728369778394e+02 -6.605005893803174e+01 1.066920544886257e+02 -5.305352178712970e+02 + 3 3.158718592284831e+02 -1.755596039144848e+02 2.550395858012224e+02 6.251932981237659e+01 + 4 6.389553037936776e+02 2.416096628525166e+02 -3.617316402898482e+02 4.680158880589204e+02 + ME 1.472809320214691e-04 + +Event 87 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.414211232216658e+02 1.437256906952883e+02 1.534640422371205e+02 -2.689983214749667e+02 + 3 5.081668091119998e+02 4.794742948200324e+02 -1.464748766741244e+02 8.296394996143997e+01 + 4 6.504120676663338e+02 -6.231999855153206e+02 -6.989165562996422e+00 1.860343715135267e+02 + ME 1.827244499504264e-04 + +Event 88 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.925516585730865e+02 1.655911293372512e+01 2.598275245766865e+02 -1.334238591297045e+02 + 3 7.159840369510271e+02 -1.056844973272874e+02 -3.694097043713192e+02 6.041526284885821e+02 + 4 4.914643044758866e+02 8.912538439356234e+01 1.095821797946327e+02 -4.707287693588777e+02 + ME 8.746210059874414e-02 + +Event 89 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.333634651097187e+02 1.209853522660007e+02 5.372166546881791e+02 
-3.129058794565920e+02 + 3 6.221307427802805e+02 5.757192259699379e+01 -4.327483989541182e+02 4.432391657372765e+02 + 4 2.445057921100010e+02 -1.785572748629945e+02 -1.044682557340609e+02 -1.303332862806847e+02 + ME 5.506309814939821e-04 + +Event 90 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.111538587406461e+02 2.628215106651484e+02 -6.985334981761831e+01 -1.512021390726355e+02 + 3 5.216486323898988e+02 1.252715366480781e+02 4.457714554600226e+02 -2.402335265468457e+02 + 4 6.671975088694549e+02 -3.880930473132266e+02 -3.759181056424042e+02 3.914356656194811e+02 + ME 2.332933720914389e-04 + +Event 91 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.007803348469016e+02 8.390513937949673e+01 2.884042062049404e+02 -1.586667134655829e+01 + 3 6.256884422056424e+02 2.364580673743878e+02 -3.590826126759745e+02 -4.545693416378727e+02 + 4 5.735312229474563e+02 -3.203632067538847e+02 7.067840647103418e+01 4.704360129844310e+02 + ME 6.494003266569841e-05 + +Event 92 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.843865618656529e+02 -2.264962467301474e+02 -5.909185329480341e+02 2.605757158639088e+02 + 3 6.645516272550813e+02 3.453347116263075e+02 4.983670680340541e+02 -2.720350487207342e+02 + 4 1.510618108792659e+02 -1.188384648961601e+02 9.255146491398018e+01 1.145933285682523e+01 + ME 9.375670630107641e-05 + +Event 93 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
5.579763469381437e+02 2.180908585044469e+02 5.135246110359700e+02 8.151996049101450e+00 + 3 3.333821836060119e+02 1.681122988324203e+02 -1.261705574188214e+02 2.587719570738212e+02 + 4 6.086414694558449e+02 -3.862031573368672e+02 -3.873540536171489e+02 -2.669239531229221e+02 + ME 5.216611746213948e-04 + +Event 94 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.534979734151987e+02 1.139662723650678e+02 2.686183171543304e+01 4.381216071501100e+02 + 3 3.856184698299742e+02 1.545134372854229e+02 -3.452526490806396e+02 7.501873282757614e+01 + 4 6.608835567548277e+02 -2.684797096504911e+02 3.183908173652065e+02 -5.131403399776862e+02 + ME 6.973420630043747e-03 + +Event 95 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.828073115974177e+02 -5.711637476392463e+01 5.915078172645689e+01 -2.705898746219726e+02 + 3 6.809618671276162e+02 3.772100991821226e+02 3.247893528880089e+02 4.646864338535507e+02 + 4 5.362308212749671e+02 -3.200937244181981e+02 -3.839401346144664e+02 -1.940965592315787e+02 + ME 2.568123623321744e-04 + +Event 96 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.639832102051441e+02 -4.275497908582962e+02 -1.317248975374901e+02 -1.230046627491649e+02 + 3 7.474114851375484e+02 6.594176555428719e+02 2.654537688070380e+02 2.309254864669503e+02 + 4 2.886053046573076e+02 -2.318678646845757e+02 -1.337288712695478e+02 -1.079208237177853e+02 + ME 2.443327128631221e-04 + +Event 97 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.095921959312568e+02 3.190102848863560e+02 3.100341192456060e+02 2.485869851668986e+02 + 3 4.555541331018014e+02 -2.788120391899956e+02 2.221549471930724e+02 -2.836205112936887e+02 + 4 5.348536709669416e+02 -4.019824569636056e+01 -5.321890664386783e+02 3.503352612679006e+01 + ME 8.220366529165536e-05 + +Event 98 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.299941952467790e+02 -2.570048161992350e+02 -4.630296380940593e+02 -2.111695271961878e+01 + 3 7.352146396921255e+02 2.361229278157243e+02 6.962552486063584e+02 3.893348873424185e+00 + 4 2.347911650610957e+02 2.088188838351074e+01 -2.332256105122990e+02 1.722360384619465e+01 + ME 6.769522853678160e-05 + +Event 99 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.290897291078425e+02 3.747236205606835e+02 2.040795775432686e+02 -4.529602465443949e+01 + 3 6.438744429739487e+02 -5.215755139094103e+02 2.133414139578182e+01 3.769325350988583e+02 + 4 4.270358279182090e+02 1.468518933487271e+02 -2.254137189390505e+02 -3.316365104444187e+02 + ME 2.029566199322513e-03 + +Event 100 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.119062275524874e+02 -4.721600394809320e+02 -1.845880136125885e+02 7.099400083769525e+01 + 3 4.523854579707451e+02 2.836789572262426e+02 -3.060214184981774e+02 -1.747276258374610e+02 + 4 5.357083144767674e+02 1.884810822546894e+02 4.906094321107658e+02 1.037336249997658e+02 + ME 6.912745684825050e-05 + +Event 101 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.024072815192739e+02 -3.080418730730875e+02 -4.692284526425155e+02 2.186993289696520e+02 + 3 3.347434020484399e+02 8.940653726951260e+01 -3.939923552329939e+01 -3.201676381969582e+02 + 4 5.628493164322862e+02 2.186353358035749e+02 5.086276881658150e+02 1.014683092273061e+02 + ME 9.306677490607613e-05 + +Event 102 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.910857738801293e+02 3.707548039128416e+02 -7.516477307090545e+01 -4.541734518311493e+02 + 3 2.311218706704978e+02 4.536804143672514e+01 -2.262982016400413e+02 1.217307902336991e+01 + 4 6.777923554493721e+02 -4.161228453495667e+02 3.014629747109467e+02 4.420003728077793e+02 + ME 2.640765676170172e-04 + +Event 103 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.627949406417043e+02 7.189602123685953e+01 -6.391860825813610e+02 -1.599038689489492e+02 + 3 5.519979886399103e+02 1.442810582977180e+02 4.734454174874869e+02 2.444057944057306e+02 + 4 2.852070707183857e+02 -2.161770795345774e+02 1.657406650938741e+02 -8.450192545678139e+01 + ME 1.656283251176821e-04 + +Event 104 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.368180791462561e+02 -3.483499330357900e+02 -2.596280064690262e+02 4.533935023690695e+01 + 3 4.635715977792427e+02 1.873023362819024e+02 -2.251347602994603e+02 -3.593477435519052e+02 + 4 5.996103230745008e+02 1.610475967538876e+02 4.847627667684864e+02 3.140083933149983e+02 + ME 9.171357862352958e-05 + +Event 105 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.701708357490472e+02 2.288495716262108e+02 -4.521314661478371e+02 -2.613422905391967e+02 + 3 3.711008490497919e+02 -3.362590561223711e+02 -8.126001400906794e+01 1.343223639771668e+02 + 4 5.587283152011615e+02 1.074094844961604e+02 5.333914801569050e+02 1.270199265620299e+02 + ME 7.066689404941827e-05 + +Event 106 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.775588183099670e+02 5.149765831731703e+02 3.445381345095062e+02 -2.741870619150275e+02 + 3 7.044100837534631e+02 -4.546975847980704e+02 -4.392260662935806e+02 3.106833358270534e+02 + 4 1.180310979365711e+02 -6.027899837509906e+01 9.468793178407483e+01 -3.649627391202603e+01 + ME 3.263550510555617e-04 + +Event 107 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.046880513041550e+02 2.289413119004024e+02 -5.349774474143720e+02 -1.644160754103498e+02 + 3 3.366746442316214e+02 -7.166101576320898e+01 2.452245434825371e+01 3.280444544890399e+02 + 4 5.586373044642237e+02 -1.572802961371934e+02 5.104549930661184e+02 -1.636283790786902e+02 + ME 8.802484405350009e-04 + +Event 108 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.239206451413978e+02 -2.218030564243363e+02 5.011455197099735e+02 -2.982172759400455e+02 + 3 2.841199272340513e+02 1.209406641294798e+02 7.967327320293103e+01 2.444374323800143e+02 + 4 5.919594276245515e+02 1.008623922948564e+02 -5.808187929129044e+02 
5.377984356003118e+01 + ME 1.736932656429071e-04 + +Event 109 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.093404598873124e+02 1.546999830656544e+02 1.629193992247174e+02 2.126421988200774e+02 + 3 5.287372542258961e+02 -2.136116696975048e+02 -1.865832176193536e+02 4.462284633214169e+02 + 4 6.619222858867910e+02 5.891168663185048e+01 2.366381839463621e+01 -6.588706621414941e+02 + ME 1.697935195428601e+01 + +Event 110 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.920948406187608e+02 -8.595212543403569e+01 -4.824913009925944e+02 -4.440392734262522e+01 + 3 4.634042325716594e+02 -2.085760624772916e+00 1.255608851371820e+02 4.460645653843308e+02 + 4 5.445009268095798e+02 8.803788605880843e+01 3.569304158554124e+02 -4.016606380417056e+02 + ME 4.143743752665715e-03 + +Event 111 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.637454700443118e+02 1.543048221589588e+02 -4.372769385391800e+02 6.225902899506631e+00 + 3 3.246747011850292e+02 -5.128652792678845e+01 -2.274142471268230e+02 2.259781269206006e+02 + 4 7.115798287706587e+02 -1.030182942321705e+02 6.646911856660031e+02 -2.322040298201072e+02 + ME 1.255388191557658e-03 + +Event 112 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.923761777814548e+02 3.939190124845535e+02 4.398224952082177e+01 -5.676954684419624e+02 + 3 5.277418353503031e+02 -4.270527740856185e+02 4.970714905179167e+01 3.060499505927538e+02 + 4 
2.798819868682419e+02 3.313376160106501e+01 -9.368939857261344e+01 2.616455178492086e+02 + ME 5.393645689799023e-05 + +Event 113 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.174898838850695e+02 -6.130145063482009e+02 3.726797356942233e+02 1.071275347265524e+01 + 3 1.705115822510491e+02 3.993583199494100e+01 -1.624320619120163e+02 3.309311510932530e+01 + 4 6.119985338638814e+02 5.730786743532598e+02 -2.102476737822071e+02 -4.380586858198050e+01 + ME 2.210054224183116e-04 + +Event 114 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.772826088252361e+02 -1.430288042596956e+02 -3.410390118171984e+02 5.674036356844297e+02 + 3 6.725037798358684e+02 3.626161999767237e+01 2.510744134018114e+02 -6.228226615527176e+02 + 4 1.502136113388952e+02 1.067671842620232e+02 8.996459841538710e+01 5.541902586828807e+01 + ME 8.927441997477867e-05 + +Event 115 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 9.320551230331127e+01 1.288474310894607e+01 -2.581623869377880e+01 8.862715576190527e+01 + 3 6.672654287607166e+02 1.525114284892182e+02 2.829200767588877e+02 5.847560574856375e+02 + 4 7.395290589359722e+02 -1.653961715981643e+02 -2.571038380651087e+02 -6.733832132475428e+02 + ME 1.818572874476195e+00 + +Event 116 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.951202926530015e+02 -4.575339943514647e+02 4.220102313368785e+01 1.844608951947750e+02 + 3 3.101750696753587e+02 
-4.711582585559527e+01 2.172188132736168e+02 2.163438466008693e+02 + 4 6.947046376716394e+02 5.046498202070600e+02 -2.594198364073050e+02 -4.008047417956444e+02 + ME 1.955292133636328e-03 + +Event 117 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.543248494478489e+02 1.390926466871539e+02 9.107024539473490e+01 6.328510524967591e+02 + 3 5.040443237953713e+02 6.874740772121054e+01 1.336336536624387e+02 -4.811200690999848e+02 + 4 3.416308267567792e+02 -2.078400544083643e+02 -2.247038990571737e+02 -1.517309833967742e+02 + ME 4.212651810419063e-04 + +Event 118 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.829230400014206e+02 5.307803371482089e+02 -3.192285892796672e+01 2.388565162167381e+02 + 3 3.965113090906140e+02 -5.470249758902820e+01 2.256187790844517e+02 -3.214420966810604e+02 + 4 5.205656509079653e+02 -4.760778395591807e+02 -1.936959201564850e+02 8.258558046432242e+01 + ME 7.477973265320172e-05 + +Event 119 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.549567073991256e+02 2.281637891139605e+02 1.474502150787006e+02 2.284600261271838e+02 + 3 4.727085372220641e+02 7.463684946128349e+01 -3.092948822053328e+02 3.495988811576870e+02 + 4 6.723347553788105e+02 -3.028006385752440e+02 1.618446671266322e+02 -5.780589072848709e+02 + ME 1.467518013687005e-02 + +Event 120 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.192117275853698e+02 4.094232477570927e+02 
-5.552624156333899e+02 -2.032775518283800e+02 + 3 3.685061529232585e+02 -2.522084621786424e+02 1.741347663658646e+02 2.046087962197375e+02 + 4 4.122821194913712e+02 -1.572147855784500e+02 3.811276492675253e+02 -1.331244391357209e+00 + ME 9.313927930293795e-05 + +Event 121 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.923953846467517e+02 -5.182078839520094e+01 -1.486351786617837e+02 -1.106262789198433e+02 + 3 6.582127150877787e+02 -3.509182841037629e+02 -1.191939510078700e+02 5.439606035624541e+02 + 4 6.493919002654695e+02 4.027390724989639e+02 2.678291296696539e+02 -4.333343246426108e+02 + ME 1.939207983209805e-03 + +Event 122 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.905732817636245e+02 3.462508192534569e+02 -5.375670569609783e+02 -2.608131264380774e+02 + 3 7.097575386120016e+02 -2.677396278645660e+02 5.849221766424141e+02 2.998954860604125e+02 + 4 9.966917962437384e+01 -7.851119138889092e+01 -4.735511968143583e+01 -3.908235962233508e+01 + ME 5.013365755272870e-04 + +Event 123 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.035126033432559e+02 2.481103298242073e+01 -3.878573016343356e+02 -1.085059780294573e+02 + 3 3.541388771651664e+02 1.572344474048876e+02 -3.105653677404273e+02 -6.512161875550808e+01 + 4 7.423485194915778e+02 -1.820454803873083e+02 6.984226693747627e+02 1.736275967849660e+02 + ME 2.050064102438738e-02 + +Event 124 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.353042728143346e+02 -4.785252055946480e+02 -2.279396245170433e+02 7.488537693644093e+01 + 3 7.454081943698109e+02 6.785307544150929e+02 3.069354144183444e+02 -3.193811081429427e+01 + 4 2.192875328158540e+02 -2.000055488204448e+02 -7.899578990130101e+01 -4.294726612214667e+01 + ME 1.401337856467237e-04 + +Event 125 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.351681880566980e+02 -1.932492970253980e+01 -4.393064933429818e+02 -5.891592456452272e+02 + 3 6.537497908129355e+02 -2.883189353576721e+01 3.454898907503182e+02 5.542510679217787e+02 + 4 1.110820211303664e+02 4.815682323830691e+01 9.381660259266361e+01 3.490817772344844e+01 + ME 1.431329278112173e-04 + +Event 126 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.568747108147123e+02 1.149185667256989e+02 4.264979152236774e+02 -3.391204725116689e+02 + 3 6.934211462641821e+02 -1.939160042589617e+02 -6.294239612595662e+02 2.169215212257339e+02 + 4 2.497041429211052e+02 7.899743753326275e+01 2.029260460358889e+02 1.221989512859350e+02 + ME 3.347046611397084e-05 + +Event 127 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.108931196972316e+02 4.270547743949553e+02 5.664613189451065e+02 -4.598718776252147e+01 + 3 4.445675167124290e+02 -1.247884466860518e+02 -4.129475031266345e+02 1.074359351009545e+02 + 4 3.445393635903407e+02 -3.022663277089035e+02 -1.535138158184720e+02 -6.144874733843321e+01 + ME 1.183537311217533e-04 + +Event 128 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.312407894292421e+02 -7.192118124205533e+01 -4.398126160332176e+02 -2.891521793453568e+02 + 3 5.717192413787027e+02 3.434745903572437e+02 1.811915566412192e+02 4.195923218357252e+02 + 4 3.970399691920551e+02 -2.715534091151883e+02 2.586210593919984e+02 -1.304401424903685e+02 + ME 1.852727275502772e-04 + +Event 129 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.644129951428380e+02 -3.595672586482286e+02 4.645590915434781e+02 3.103882489514913e+02 + 3 1.967652372382453e+02 -5.204943416929044e+01 8.794498000645014e+00 -1.895522930301723e+02 + 4 6.388217676189165e+02 4.116166928175190e+02 -4.733535895441231e+02 -1.208359559213190e+02 + ME 3.093474516972251e-04 + +Event 130 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.302263990443511e+02 -1.919590472356484e+02 3.836584700935805e+02 -5.909217345563752e+02 + 3 4.156541164903923e+02 2.203243106780774e+02 -1.767969453775071e+02 3.049071707664833e+02 + 4 3.541194844652567e+02 -2.836526344242890e+01 -2.068615247160734e+02 2.860145637898919e+02 + ME 3.121383618494824e-05 + +Event 131 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.308323688168238e+02 -1.780469473698229e+02 1.469011263880862e+02 1.710582294195633e+00 + 3 7.308075033948297e+02 5.219262643529273e+02 -3.840435213624621e+02 3.379099810545738e+02 + 4 5.383601277883466e+02 -3.438793169831045e+02 2.371423949743758e+02 -3.396205633487695e+02 + ME 1.064816672606941e-03 + +Event 132 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.909630762789656e+02 -4.293852116769705e+02 -3.988922148105424e+02 7.583335995300345e+01 + 3 5.415993952096327e+02 2.260703809971038e+02 3.221145619770359e+02 -3.721079100067703e+02 + 4 3.674375285114019e+02 2.033148306798665e+02 7.677765283350676e+01 2.962745500537669e+02 + ME 3.324711705859393e-05 + +Event 133 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.506052863582995e+02 2.189991325227701e+02 -3.914006430783633e+02 -4.347459771134344e+01 + 3 4.043998006859108e+02 3.160348074769271e+02 8.738893432792007e+01 2.366946839598571e+02 + 4 6.449949129557899e+02 -5.350339399996972e+02 3.040117087504432e+02 -1.932200862485140e+02 + ME 3.142956007462741e-04 + +Event 134 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.151470882937615e+02 -1.041377497037514e+01 -4.186394096729772e+01 7.138447461686594e+02 + 3 3.416424731356658e+02 1.638631808685802e+02 3.081581136487585e+01 -2.981925940995342e+02 + 4 4.432104385705717e+02 -1.534494058982045e+02 1.104812960242201e+01 -4.156521520691246e+02 + ME 5.519279759514214e-02 + +Event 135 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.115730144432832e+02 -3.219296530898238e+02 2.184242454110169e+02 -5.958089478700319e+02 + 3 1.627059459894212e+02 -6.880794311551747e+01 -3.259803939022061e+01 1.437917231708342e+02 + 4 6.257210395672955e+02 3.907375962053413e+02 -1.858262060207963e+02 4.520172246991979e+02 + ME 
2.117985272093105e-04 + +Event 136 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.195404287114590e+02 -4.369992732083462e+02 -4.270318019286998e+02 3.800182941743400e+02 + 3 6.668605996318225e+02 3.634158794560480e+02 4.690430049045652e+02 -3.043527845290678e+02 + 4 1.135989716567186e+02 7.358339375229816e+01 -4.201120297586537e+01 -7.566550964527266e+01 + ME 1.806377591004282e-03 + +Event 137 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.722782806745004e+02 -6.045581260407006e+02 -2.538460778300669e+02 1.484241478840623e+02 + 3 6.869263774705696e+02 6.661257235671317e+02 1.481819739565760e+02 -7.865412297735674e+01 + 4 1.407953418549305e+02 -6.156759752643100e+01 1.056641038734909e+02 -6.977002490670537e+01 + ME 5.200649761334786e-04 + +Event 138 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.463287544295633e+02 8.684709774942756e+01 2.409249839962013e+02 -5.934253049048401e+02 + 3 3.917330799270068e+02 1.767690441671677e+02 4.696120064017492e+01 3.464132742372293e+02 + 4 4.619381656434300e+02 -2.636161419165952e+02 -2.878861846363762e+02 2.470120306676108e+02 + ME 5.822957333924486e-05 + +Event 139 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.994802063237942e+02 -1.272876183039154e+02 6.552211336810932e+00 2.710042891410715e+02 + 3 7.257546970836095e+02 -8.848613612326723e+00 5.127896146768585e+00 -7.256826352181578e+02 + 4 4.747650965925944e+02 
1.361362319162418e+02 -1.168010748357914e+01 4.546783460770865e+02 + ME 1.720917839250544e-04 + +Event 140 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.326756101999780e+02 5.655005379385240e+02 4.343799907428445e+02 1.683351270988810e+02 + 3 7.428339005597779e+02 -5.680473426214219e+02 -4.534832054058505e+02 -1.532233754243464e+02 + 4 2.449048924024402e+01 2.546804682897962e+00 1.910321466300584e+01 -1.511175167453447e+01 + ME 4.675968868012692e-03 + +Event 141 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.363238871411324e+02 -6.772722174663235e+02 -2.824373475598686e+02 -6.086341204880691e+01 + 3 5.504260535970959e+02 4.650298533191526e+02 2.914345410616539e+02 4.221355560271698e+01 + 4 2.132500592617707e+02 2.122423641471711e+02 -8.997193501785842e+00 1.864985644608984e+01 + ME 7.311450523927662e-05 + +Event 142 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.862280565156748e+02 4.248793793116474e+01 -2.479279504752428e+02 -5.295184989682996e+02 + 3 4.287264749982904e+02 -3.025296967755301e+02 2.785471849307630e+02 1.212173201341823e+02 + 4 4.850454684860337e+02 2.600417588443672e+02 -3.061923445552111e+01 4.083011788341167e+02 + ME 4.576034531002493e-05 + +Event 143 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.464531733710510e+02 4.046044690030689e+01 -2.103865804466287e+02 1.218179201483224e+02 + 3 5.378449948854584e+02 4.607829603950881e+02 
-2.747641700963840e+02 3.822241180409942e+01 + 4 7.157018317434902e+02 -5.012434072953949e+02 4.851507505430127e+02 -1.600403319524220e+02 + ME 1.298042202341705e-03 + +Event 144 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.367418008803521e+02 -1.343004856786532e+02 -4.048537736989351e+02 -3.258044847458254e+02 + 3 6.294877130859599e+02 3.313530054622211e+02 5.282137272543232e+02 8.631468610520756e+01 + 4 3.337704860336883e+02 -1.970525197835678e+02 -1.233599535553879e+02 2.394897986406179e+02 + ME 2.615543595351409e-05 + +Event 145 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.805380148481771e+01 -3.411514819754512e+01 -4.339750646760406e+01 -3.980116822894492e+01 + 3 6.831461500979880e+02 -3.834019790669201e+02 -2.756424954453614e+02 -4.936727656514237e+02 + 4 7.488000484171945e+02 4.175171272644653e+02 3.190400019129655e+02 5.334739338803686e+02 + ME 4.870767280208700e-01 + +Event 146 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.031746658797123e+02 4.202301876294930e+02 2.767377273314875e+02 2.750283520766640e+00 + 3 4.317115817339341e+02 -1.098088257924671e+02 -5.455162180567243e+01 4.139336083717602e+02 + 4 5.651137523863538e+02 -3.104213618370259e+02 -2.221861055258150e+02 -4.166838918925268e+02 + ME 4.440309177747646e-03 + +Event 147 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.251223043705629e+02 -4.223502783198937e+02 -4.694338569631594e+01 
1.206377286808446e+01 + 3 5.457819748703675e+02 2.791608945230573e+02 -4.384138579515957e+02 -1.665546403390878e+02 + 4 5.290957207590694e+02 1.431893837968363e+02 4.853572436479119e+02 1.544908674710035e+02 + ME 5.831728036338814e-05 + +Event 148 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.905785821272530e+02 6.249608768654492e+02 -6.243387159972372e+01 -2.870970082698921e+02 + 3 1.361638260920091e+02 2.862044352088506e+01 1.704210379179795e+01 1.320266050727364e+02 + 4 6.732575917807409e+02 -6.535813203863348e+02 4.539176780792521e+01 1.550704031971582e+02 + ME 9.516727857169778e-04 + +Event 149 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.694705528096943e+02 -5.216497821741067e+02 -3.785079074709545e+02 1.811189935345937e+02 + 3 2.821401257551277e+02 1.148500354702071e-01 2.786662494166578e+02 -4.413795199872403e+01 + 4 5.483893214351777e+02 5.215349321386363e+02 9.984165805429679e+01 -1.369810415358696e+02 + ME 1.949391139411118e-04 + +Event 150 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.637486188995367e+02 -4.033412855298820e+02 -2.279949807412008e+02 -1.992178895453991e+01 + 3 3.756800751656201e+02 6.230662615514298e+01 -2.632310737913946e+02 -2.606967683041707e+02 + 4 6.605713059348441e+02 3.410346593747393e+02 4.912260545325954e+02 2.806185572587108e+02 + ME 2.160705042516003e-04 + +Event 151 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
3.821954355913596e+02 -2.528320044280691e+02 2.861764538722268e+02 1.588602445142563e+01 + 3 6.796189325418251e+02 2.911670128135292e+02 -4.900375979142739e+02 3.700902818893582e+02 + 4 4.381856318668152e+02 -3.833500838546018e+01 2.038611440420471e+02 -3.859763063407838e+02 + ME 8.216179648824654e-03 + +Event 152 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.751133298339792e+02 -2.999578895043981e+02 -2.855974213275218e+02 -5.331391803034741e+02 + 3 4.976977783498468e+02 -3.003988119418482e+00 1.843802943840355e+02 4.622747685874795e+02 + 4 3.271888918161745e+02 3.029618776238166e+02 1.012171269434863e+02 7.086441171599445e+01 + ME 1.204227971939525e-04 + +Event 153 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.729293620257127e+02 1.558357805102956e+02 -7.193392860849491e+01 2.110174585940510e+01 + 3 6.524550819255464e+02 2.410158908712478e+02 5.786677971610501e+02 1.809766692333240e+02 + 4 6.746155560487410e+02 -3.968516713815435e+02 -5.067338685525552e+02 -2.020784150927291e+02 + ME 6.033778989167836e-04 + +Event 154 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.585658455851003e+02 -2.410305357139302e+02 -2.116446673272158e+02 -5.751693564652296e+02 + 3 5.764400833248006e+02 3.388133979948971e+02 3.092747322371399e+02 3.490527051926401e+02 + 4 2.649940710900989e+02 -9.778286228096692e+01 -9.763006490992422e+01 2.261166512725895e+02 + ME 3.659509938189938e-05 + +Event 155 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.686586231936362e+02 -1.693366246265499e+02 -1.542203680657918e+02 5.204938187588980e+02 + 3 1.882190564276537e+02 -1.089234770645493e+02 -9.145416397064868e+01 1.232810822434430e+02 + 4 7.431223203787106e+02 2.782601016910993e+02 2.456745320364405e+02 -6.437749010023409e+02 + ME 6.832012408051167e-01 + +Event 156 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.143652095725132e+02 2.879464601546110e+02 5.379391909976825e+02 -7.178351904348051e+01 + 3 6.287751645293093e+02 -4.584164185734782e+02 -4.225140875260601e+02 -8.181956094447750e+01 + 4 2.568596258981783e+02 1.704699584188668e+02 -1.154251034716223e+02 1.536030799879582e+02 + ME 2.902290331632261e-05 + +Event 157 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.050842109798971e+02 4.185498850973046e+02 -1.305174306570672e+02 -2.507812875014723e+02 + 3 5.170424494038049e+02 -3.084595065654855e+02 3.930456446728388e+02 -1.330441599566700e+02 + 4 4.778733396162974e+02 -1.100903785318191e+02 -2.625282140157716e+02 3.838254474581423e+02 + ME 4.038264048703095e-05 + +Event 158 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.312542366204099e+02 -3.114503370626313e+02 2.737030704635237e+02 1.185982013584742e+02 + 3 6.944315393047832e+02 2.166643175309469e+02 -6.173965008138002e+02 -2.326226495269425e+02 + 4 3.743142240748071e+02 9.478601953168445e+01 3.436934303502765e+02 1.140244481684682e+02 + ME 3.688913238842345e-05 + +Event 159 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.860112473308646e+02 -1.581297551692178e+02 4.935632758462007e+02 2.734948907463652e+02 + 3 3.772013313646349e+02 -2.371132827856262e+02 -1.305099443644436e+02 -2.627266448837395e+02 + 4 5.367874213045002e+02 3.952430379548442e+02 -3.630533314817573e+02 -1.076824586262577e+01 + ME 1.032822977989639e-04 + +Event 160 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.883409724804535e+02 -3.739819298758818e+02 -2.887651121595530e+02 3.505671490956299e+02 + 3 4.300332553173178e+02 1.788055146224819e+02 3.829208006453583e+02 7.955406370837679e+01 + 4 4.816257722022288e+02 1.951764152533999e+02 -9.415568848580531e+01 -4.301212128040067e+02 + ME 9.827203455379468e-03 + +Event 161 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.868305165969143e+02 4.119610488151632e+00 5.515184990814984e+02 4.093244831537709e+02 + 3 3.260821955312832e+02 -1.956999890649130e+02 -2.483451099187457e+02 -7.972338993006395e+01 + 4 4.870872878718019e+02 1.915803785767614e+02 -3.031733891627526e+02 -3.296010932237068e+02 + ME 1.078228281419303e-03 + +Event 162 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.159818802305118e+02 -2.018126805027918e+02 4.096951387107713e+01 -6.512536763314942e+01 + 3 6.870078865581223e+02 4.896730732821633e+02 -2.356527215298930e+02 -4.203188222421332e+02 + 4 5.970102332113653e+02 -2.878603927793715e+02 1.946832076588155e+02 4.854441898752826e+02 + ME 5.381251197778350e-05 + +Event 163 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.889699854403285e+02 -4.067839821807849e+01 -2.740835242435768e+02 4.028835269878221e+02 + 3 4.282392920294496e+02 4.007468150560175e+02 -8.832740907173850e+01 -1.224301852772270e+02 + 4 5.827907225302216e+02 -3.600684168379391e+02 3.624109333153152e+02 -2.804533417105952e+02 + ME 4.360157528892435e-04 + +Event 164 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.224346677404146e+02 -1.282049393554145e+02 5.480608628970116e+02 -2.657399098565701e+02 + 3 7.444531740822748e+02 1.794330131141779e+02 -6.708967511266459e+02 2.681638893170602e+02 + 4 1.331121581773107e+02 -5.122807375876334e+01 1.228358882296343e+02 -2.423979460490185e+00 + ME 1.370014400319146e-04 + +Event 165 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.980339706506675e+02 -5.154669325341684e+01 -4.947847840614098e+02 4.896757907618869e+02 + 3 1.362964882116331e+02 4.252532371924361e+01 -5.641238783031591e+01 -1.165588780002596e+02 + 4 6.656695411377010e+02 9.021369534174053e+00 5.511971718917263e+02 -3.731169127616273e+02 + ME 1.459298868835044e-03 + +Event 166 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.060640747281172e+02 -1.981167412190919e+02 -9.095380261170787e+01 -2.148310510107331e+02 + 3 5.580104478575087e+02 -3.585720992432472e+02 -1.558095186186281e+02 3.981521109704928e+02 + 4 6.359254774143742e+02 5.566888404623390e+02 2.467633212303362e+02 
-1.833210599597597e+02 + ME 3.018805019322452e-04 + +Event 167 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.833153623322893e+02 2.526850217013923e+02 8.687924899084067e+01 9.417998957332070e+01 + 3 6.595685044563417e+02 -8.780626893611857e+01 -2.875856231737450e+02 -5.870393347553995e+02 + 4 5.571161332113691e+02 -1.648787527652739e+02 2.007063741829043e+02 4.928593451820789e+02 + ME 7.385299698630873e-05 + +Event 168 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.026267479353970e+02 -5.987968578530476e+02 5.775180228477304e+00 6.758674164241535e+01 + 3 4.991211680715714e+02 3.812575567959844e+02 3.220701575873952e+02 -5.952259631185695e+00 + 4 3.982520839930310e+02 2.175393010570631e+02 -3.278453378158729e+02 -6.163448201122965e+01 + ME 9.626748820055277e-05 + +Event 169 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.510662376679774e+02 -9.251111075413948e+01 -5.291920243323356e+02 -1.227660134875281e+02 + 3 5.034535790022879e+02 -2.816014265681678e+02 3.283802195198171e+02 2.575511098657944e+02 + 4 4.454801833297350e+02 3.741125373223072e+02 2.008118048125186e+02 -1.347850963782664e+02 + ME 1.537032552332358e-04 + +Event 170 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.814808559369750e+02 3.658097943502283e+01 -1.412301634042881e+02 -2.407225480659936e+02 + 3 6.646522150540472e+02 2.753499086551697e+02 -1.631412967142655e+02 5.825203104495404e+02 + 4 
5.538669290089781e+02 -3.119308880901925e+02 3.043714601185536e+02 -3.417977623835468e+02 + ME 7.865718803076825e-04 + +Event 171 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.777965289077954e+02 -6.143496808852239e+01 -1.603735842336766e+00 1.668375809551635e+02 + 3 7.439290290569696e+02 2.163074211412066e+01 -1.907051550939618e+01 -7.433699124308462e+02 + 4 5.782744420352348e+02 3.980422597440168e+01 2.067425135173310e+01 5.765323314756826e+02 + ME 2.064506737469714e-03 + +Event 172 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.369499454750680e+02 -1.250080331667567e+01 -3.518152151649629e+01 -1.317622025690455e+02 + 3 6.692885586315896e+02 -2.346283187163472e+02 -6.130705295376303e+02 1.305421486874673e+02 + 4 6.937614958933425e+02 2.471291220330227e+02 6.482520510541266e+02 1.220053881578281e+00 + ME 5.063032635761837e-04 + +Event 173 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.088772083623148e+02 4.973951266878910e+01 3.171232495758635e+01 -7.064185769505257e+02 + 3 5.785136264307897e+02 8.584813303397833e+01 5.766505028397116e+01 5.691949191590091e+02 + 4 2.126091652068945e+02 -1.355876457027673e+02 -8.937737524155736e+01 1.372236577915166e+02 + ME 1.725049542963853e-04 + +Event 174 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.367208701713478e+02 -3.923163287174697e+01 4.325755195957346e+02 -4.543585887727656e+01 + 3 3.528978856725083e+02 
9.622572295106897e+01 1.987077746703232e+02 -2.753048278549414e+02 + 4 7.103812441561447e+02 -5.699409007932230e+01 -6.312832942660565e+02 3.207406867322184e+02 + ME 9.370305693890175e-04 + +Event 175 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.418562164876807e+02 1.962785648722137e+02 -6.110736372974048e+02 -6.567908015856713e+00 + 3 4.843421844702150e+02 -1.886631806266161e+02 3.569879071908528e+02 -2.674942804112338e+02 + 4 3.738015990421036e+02 -7.615384245597570e+00 2.540857301065516e+02 2.740621884270907e+02 + ME 3.032139219937057e-05 + +Event 176 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.288652703123263e+02 4.005522031116294e+02 3.691482793515075e+02 3.142594606996526e+02 + 3 7.209127580467475e+02 -4.124575135572966e+02 -5.165298058232565e+02 -2.877341896975221e+02 + 4 1.502219716409257e+02 1.190531044566672e+01 1.473815264717492e+02 -2.652527100213051e+01 + ME 1.722342549898366e-04 + +Event 177 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.716578040000081e+02 -4.521622645932389e+02 -1.012739918234156e+01 1.338200520767546e+02 + 3 3.021382980750608e+02 -2.714821202364266e+02 6.773215888881046e+01 -1.140059832109246e+02 + 4 7.262038979249323e+02 7.236443848296656e+02 -5.760475970646935e+01 -1.981406886582875e+01 + ME 2.366946674082088e-03 + +Event 178 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.350088877399502e+02 -3.684484945749095e+02 
-2.561732769425163e+02 -5.821159885132296e+02 + 3 1.415495174310248e+02 7.181268644032879e+01 1.095010133995263e+02 5.374692563910759e+01 + 4 6.234415948290248e+02 2.966358081345808e+02 1.466722635429900e+02 5.283690628741219e+02 + ME 1.046585665999442e-04 + +Event 179 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.426064621425415e+02 6.748632301344054e+01 7.201624948975953e+02 -1.681544967131679e+02 + 3 5.821031882499328e+02 8.394276920418474e-01 -5.588194474899292e+02 1.629854049874920e+02 + 4 1.752903496075257e+02 -6.832575070548242e+01 -1.613430474076661e+02 5.169091725675909e+00 + ME 9.213110738572106e-05 + +Event 180 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.099515195485482e+02 2.272495331206022e+02 1.762692760011278e+02 -5.378918555193874e+02 + 3 5.718889655176698e+02 4.324570510796980e+01 -3.278409766521432e+02 4.665909256493895e+02 + 4 3.181595149337818e+02 -2.704952382285720e+02 1.515717006510154e+02 7.130092986999803e+01 + ME 5.414868413371670e-05 + +Event 181 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.206370886915177e+02 -8.151225636567757e+01 1.767749325039422e+01 8.715827822142556e+01 + 3 6.451493408002738e+02 -6.748216257939075e+01 4.373428479320614e+02 4.694625256943417e+02 + 4 7.342135705082084e+02 1.489944189450684e+02 -4.550203411824557e+02 -5.566208039157672e+02 + ME 7.247843317331434e-02 + +Event 182 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 4.626866082364763e+02 -3.084610429505738e+02 3.306629079434072e+02 9.794245113140902e+01 + 3 4.974966719253475e+02 3.582955998671218e+02 1.664640547097976e+02 -3.023523113558579e+02 + 4 5.398167198381767e+02 -4.983455691654799e+01 -4.971269626532049e+02 2.044098602244490e+02 + ME 5.969440593614299e-05 + +Event 183 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.304723045950490e+02 3.244647182058515e+00 3.209425641774955e+02 7.872284845075714e+01 + 3 4.379804819457451e+02 2.312428523500661e+02 3.131807483468383e+02 2.006775141049615e+02 + 4 7.315472134592064e+02 -2.344874995321246e+02 -6.341233125243343e+02 -2.794003625557186e+02 + ME 4.959080546808864e-03 + +Event 184 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.470051035005912e+02 -4.953964753944514e+02 -4.028924750569615e+02 3.876552725878487e+02 + 3 2.183325716323390e+02 1.119040172022778e+02 1.451703047217021e+02 -1.186262424448778e+02 + 4 5.346623248670699e+02 3.834924581921737e+02 2.577221703352594e+02 -2.690290301429710e+02 + ME 5.452709005037568e-04 + +Event 185 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.448583927494090e+02 2.810173563272025e+02 -3.384637477435971e+02 6.610995769032235e+01 + 3 6.236443795626774e+02 -1.690803760724666e+02 5.125139620028375e+02 3.125277225134823e+02 + 4 4.314972276879136e+02 -1.119369802547359e+02 -1.740502142592404e+02 -3.786376802038046e+02 + ME 6.967293094522542e-03 + +Event 186 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.802792190696962e+02 -1.681815241656754e+02 5.427923640013703e+02 3.739936368565512e+02 + 3 6.331554869749547e+02 3.172201723440435e+02 -4.588808692389625e+02 -2.994755095011972e+02 + 4 1.865652939553488e+02 -1.490386481783679e+02 -8.391149476240781e+01 -7.451812735535422e+01 + ME 3.281236112138701e-04 + +Event 187 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.472897115267964e+02 -6.988402471604772e+02 -2.391684329048670e+02 1.134137672609268e+02 + 3 6.826908170748525e+02 6.328852277257668e+02 2.212839847556716e+02 -1.286718241709738e+02 + 4 7.001947139835137e+01 6.595501943471052e+01 1.788444814919546e+01 1.525805691004725e+01 + ME 1.462619207180431e-04 + +Event 188 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.496068877140277e+02 -5.024316730938292e+02 -3.980061777252907e+02 -1.055585379310703e+02 + 3 4.885976180718370e+02 4.424928723138696e+02 1.459942636040002e+02 -1.470148473169288e+02 + 4 3.617954942141355e+02 5.993880077995961e+01 2.520119141212904e+02 2.525733852479991e+02 + ME 2.846993304776540e-05 + +Event 189 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.082379946778651e+02 2.679237131173331e+02 -7.718184435750958e+01 2.981913934867988e+02 + 3 5.864211573889180e+02 -5.780822197382727e+02 -6.394893886953381e+01 7.497502433004088e+01 + 4 5.053408479332165e+02 3.101585066209396e+02 1.411307832270433e+02 -3.731664178168398e+02 + ME 1.944788176545440e-03 + +Event 190 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.472516823166367e+02 6.463779961822676e+02 -3.289365889632787e+01 6.945035458816682e+00 + 3 4.318767277050752e+02 -3.286790725415816e+02 -7.183748821760633e+00 -2.800642229191640e+02 + 4 4.208715899782886e+02 -3.176989236406860e+02 4.007740771808847e+01 2.731191874603473e+02 + ME 3.412998170884568e-05 + +Event 191 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.757500036387050e+02 6.222744522021635e+02 -2.261571472854043e+02 1.351499844096745e+02 + 3 3.644673602666566e+02 -2.020102809038697e+02 1.114149692296406e+02 -2.821613151026251e+02 + 4 4.597826360946378e+02 -4.202641712982939e+02 1.147421780557638e+02 1.470113306929506e+02 + ME 5.395261495326662e-05 + +Event 192 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.394562478491533e+02 -7.307873850878615e+02 3.988568028534699e+01 1.056147375500684e+02 + 3 8.098058518630977e+01 5.419286926826392e+01 4.244928426361276e+00 -6.002473390399247e+01 + 4 6.795631669645364e+02 6.765945158195975e+02 -4.413060871170818e+01 -4.559000364607595e+01 + ME 4.227234596358380e-04 + +Event 193 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.607395612273152e+02 -3.164229781907933e+02 -3.517992386171808e+02 -3.009030576558548e+02 + 3 3.741643617741926e+02 -2.156271676189966e+02 1.666697084176705e+02 2.563690747778811e+02 + 4 5.650960769984921e+02 5.320501458097899e+02 1.851295301995103e+02 4.453398287797368e+01 + ME 
9.187779745624233e-05 + +Event 194 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.729373416862012e+02 -2.155045544874616e+02 -1.679805246197324e+02 5.035846779262560e+02 + 3 2.831035485618876e+02 -2.543279085173982e+02 1.042261812492671e+02 -6.783684323208051e+01 + 4 6.439591097519117e+02 4.698324630048597e+02 6.375434337046515e+01 -4.357478346941755e+02 + ME 1.795281778009961e-03 + +Event 195 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.572874060171201e+02 -5.433144409127298e+02 3.646295232533866e+01 1.185290019729285e+02 + 3 6.765845568040619e+02 5.574999049241243e+02 -1.212989803269169e+01 -3.831623469093195e+02 + 4 2.661280371788181e+02 -1.418546401139455e+01 -2.433305429264712e+01 2.646333449363910e+02 + ME 3.403395687100383e-04 + +Event 196 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.405888343305829e+02 3.940239871950471e+02 -8.826690628749978e+01 -3.594305754554688e+02 + 3 6.983754392688073e+02 -3.888370902622853e+02 -5.513072771506091e+01 5.774898910559966e+02 + 4 2.610357264006097e+02 -5.186896932761887e+00 1.433976340025607e+02 -2.180593156005277e+02 + ME 5.545021598610717e-03 + +Event 197 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.783346334111662e+02 2.282410890438732e+02 -1.474467226896363e+02 6.029624695020832e+01 + 3 6.434654504578667e+02 1.172104173128903e+01 6.205939438823053e+02 1.696277097949658e+02 + 4 5.781999161309676e+02 
-2.399621307751624e+02 -4.731472211926696e+02 -2.299239567451741e+02 + ME 3.335909098409914e-04 + +Event 198 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.349536439683952e+02 1.774777254208014e+02 -9.709992209949115e+01 3.850427697141143e+02 + 3 4.134500153047131e+02 7.095914770071856e+01 -4.041194890923879e+02 -5.092301099466206e+01 + 4 6.515963407268930e+02 -2.484368731215194e+02 5.012194111918788e+02 -3.341197587194524e+02 + ME 7.918644172557692e-04 + +Event 199 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.682109290882580e+02 2.136897997740939e+02 -5.035763266519416e+02 3.837361052354048e+02 + 3 1.424120473397155e+02 8.952788458880865e+01 -4.686863299276860e+01 -1.003458038481504e+02 + 4 6.893770235720265e+02 -3.032176843629025e+02 5.504449596447103e+02 -2.833903013872543e+02 + ME 1.176385421247642e-03 + +Event 200 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.959952693237886e+02 -4.878566955018547e+02 -2.510837703973927e+01 -3.414319479966339e+02 + 3 4.479637599869171e+02 4.499951041477977e+01 7.146287716862109e+01 4.399313940955211e+02 + 4 4.560409706892943e+02 4.428571850870749e+02 -4.635450012888172e+01 -9.849944609888662e+01 + ME 5.500878438581106e-04 + +Event 201 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.203096708642927e+02 -1.112696379946441e+02 1.367824427202020e+02 4.895219960522141e+02 + 3 2.871951825199399e+02 -2.582762312778227e+02 
1.200876310962787e+02 3.678888524092984e+01 + 4 6.924951466157675e+02 3.695458692724667e+02 -2.568700738164807e+02 -5.263108812931440e+02 + ME 6.656074573136404e-03 + +Event 202 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.158792376054218e+02 2.112389782008979e+01 -7.195062193526134e+01 -2.024369881546198e+02 + 3 5.463652944256572e+02 2.787950008966255e+02 -3.108926376755555e+02 -3.523267663221479e+02 + 4 7.377554679689214e+02 -2.999188987167153e+02 3.828432596108168e+02 5.547637544767679e+02 + ME 8.717650179896926e-03 + +Event 203 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.124273471334272e+02 4.879265047129838e+02 -1.059167473143779e+02 -5.081949365946949e+02 + 3 6.746108110440505e+02 -5.248642991835990e+02 4.352799102536775e+01 4.215714978711399e+02 + 4 1.129618418225217e+02 3.693779447061508e+01 6.238875628901039e+01 8.662343872355493e+01 + ME 5.367797674979175e-05 + +Event 204 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.084787759842806e+02 4.992472551829617e+02 -4.528122431715626e+02 -2.183012291454193e+02 + 3 1.034373169902747e+02 -8.959882065299325e+01 -3.938861547415053e+01 -3.346441176487074e+01 + 4 6.880839070254442e+02 -4.096484345299685e+02 4.922008586457130e+02 2.517656409102900e+02 + ME 3.012058085552873e-04 + +Event 205 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.496569846879349e+02 -5.869603795046560e+02 -2.345911576090251e+02 
1.499956646614410e+02 + 3 2.543878192344406e+02 -1.851019090219872e+00 2.474675926596849e+02 -5.890268997594536e+01 + 4 5.959551960776247e+02 5.888113985948759e+02 -1.287643505065981e+01 -9.109297468549572e+01 + ME 1.878338063785423e-04 + +Event 206 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.172060642836409e+02 2.978040691523503e+02 4.166709400833432e+02 3.444435946201742e+02 + 3 7.205754982426179e+02 -2.468045809177360e+02 -5.690387091428451e+02 -3.667580878490107e+02 + 4 1.622184374737408e+02 -5.099948823461420e+01 1.523677690595017e+02 2.231449322883639e+01 + ME 7.370384336338383e-05 + +Event 207 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.250113096394139e+02 -1.091977068802181e+02 -4.322753509449321e+02 2.772196909074646e+02 + 3 5.240251005653129e+02 3.541948269240045e+02 3.738549241960731e+02 9.685466564450641e+01 + 4 4.509635897952731e+02 -2.449971200437864e+02 5.842042674885890e+01 -3.740743565519710e+02 + ME 3.388857710130573e-03 + +Event 208 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.449444343820048e+02 1.928662436733418e+02 -3.595193210859464e+02 1.775500478872298e+02 + 3 4.894053462810563e+02 -2.195789585225566e+02 2.295326432211599e+02 3.723136307450180e+02 + 4 5.656502193369389e+02 2.671271484921491e+01 1.299866778647864e+02 -5.498636786322478e+02 + ME 2.077582503030861e-01 + +Event 209 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 
4.949423498078045e+02 -2.830370809537592e+02 -1.684680620467475e+02 -3.694271951395290e+02 + 3 6.326444171345162e+02 3.898538983719824e+02 -1.748162179498051e+02 4.665749526039371e+02 + 4 3.724132330576787e+02 -1.068168174182232e+02 3.432842799965526e+02 -9.714775746440787e+01 + ME 1.477263061598869e-04 + +Event 210 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.469464199121013e+02 -4.947084169679944e+02 2.319240083666634e+02 -2.500445517953787e+01 + 3 2.929141603572805e+02 -5.602902696925144e+01 2.099470855189297e+01 2.867379913571110e+02 + 4 6.601394197306176e+02 5.507374439372460e+02 -2.529187169185561e+02 -2.617335361775728e+02 + ME 1.572734078973362e-03 + +Event 211 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.484404249965426e+02 1.659778109685240e+01 3.514591842057613e+02 -4.206992456262191e+02 + 3 4.635537606517393e+02 -3.607884938122542e+02 -3.140996451540818e+01 2.893564685231623e+02 + 4 4.880058143517180e+02 3.441907127154017e+02 -3.200492196903531e+02 1.313427771030569e+02 + ME 5.014972467751673e-05 + +Event 212 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.930853388432640e+02 -3.424793196872474e+02 -8.152110066892752e+01 5.970171795281683e+02 + 3 9.131624224772825e+01 6.738328155058524e+01 1.365968298972706e+01 6.009627714210347e+01 + 4 7.155984189090078e+02 2.750960381366621e+02 6.786141767920040e+01 -6.571134566702718e+02 + ME 3.256503666439727e-01 + +Event 213 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.316448870278512e+02 4.203233031264803e+02 4.913598772661251e+02 -3.423419819067778e+02 + 3 4.750162603483208e+02 -1.726357548525294e+02 -3.708603862154638e+02 2.414537588813190e+02 + 4 2.933388526238279e+02 -2.476875482739507e+02 -1.204994910506614e+02 1.008882230254589e+02 + ME 4.018906938535000e-05 + +Event 214 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.805779599533694e+02 3.904513572450257e+02 -1.742898429406511e+02 2.193763065287195e+02 + 3 6.164938851206517e+02 -5.563771061772993e+02 2.227142270499353e+02 1.445946028815716e+02 + 4 4.029281549259790e+02 1.659257489322735e+02 -4.842438410928419e+01 -3.639709094102910e+02 + ME 1.133147876913260e-02 + +Event 215 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.610896439725646e+02 -3.106576460930040e+02 -3.050258363865883e+02 -1.518378274323046e+02 + 3 7.153470686812822e+02 2.726436938726978e+02 6.046054769368645e+02 2.680280994976065e+02 + 4 3.235632873461536e+02 3.801395222030654e+01 -2.995796405502761e+02 -1.161902720653026e+02 + ME 2.133719548484640e-04 + +Event 216 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.309452696424390e+02 -4.912950836090374e+02 -3.608909251460835e+01 -1.980646298023531e+02 + 3 6.627369363365401e+02 4.479096066616001e+02 2.308759280187053e+02 4.304573578259470e+02 + 4 3.063177940210213e+02 4.338547694743725e+01 -1.947868355040969e+02 -2.323927280235939e+02 + ME 1.884481162219252e-03 + +Event 217 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 
0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.608032244164872e+02 2.215832851737383e+02 3.318832460795877e+02 -2.304212888079595e+02 + 3 3.107022283044696e+02 -4.724697178681159e+01 2.830528592337837e+02 -1.190994425256425e+02 + 4 7.284945472790436e+02 -1.743363133869267e+02 -6.149361053133714e+02 3.495207313336019e+02 + ME 2.900400298383267e-03 + +Event 218 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.336891602166278e+02 5.249943224110906e+02 1.648031440577740e+02 -3.142973702098818e+02 + 3 5.195346944320728e+02 -3.655895580768900e+02 -3.610279413409488e+02 7.693763263116620e+01 + 4 3.467761453512955e+02 -1.594047643342020e+02 1.962247972831738e+02 2.373597375787181e+02 + ME 2.707470513533884e-05 + +Event 219 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.579228498517418e+02 -4.166553381892275e+01 1.191899344508914e+02 2.249042891828000e+02 + 3 7.453266221408655e+02 -3.354388163550536e+01 -3.947818065141065e+02 -6.312954196904916e+02 + 4 4.967505280073932e+02 7.520941545442811e+01 2.755918720632151e+02 4.063911305076915e+02 + ME 6.137847339595072e-05 + +Event 220 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.940336288355579e+02 -2.383755021420816e+02 -2.918661661143954e+02 3.194690712363630e+02 + 3 7.129224521449783e+02 2.727447507998267e+02 2.535039959962390e+02 -6.079510240944472e+02 + 4 2.930439190194636e+02 -3.436924865774515e+01 3.836217011815622e+01 2.884819528580837e+02 + ME 1.763978243357078e-04 + +Event 221 
Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.305414381337778e+02 -2.712796684963201e+02 -1.199910663213095e+02 -1.458325333632649e+02 + 3 7.388441803280767e+02 5.510455284380057e+02 4.375213740715826e+02 2.254209298704556e+02 + 4 4.306143815381457e+02 -2.797658599416857e+02 -3.175303077502730e+02 -7.958839650719051e+01 + ME 1.340250572754580e-04 + +Event 222 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.657562074797755e+02 2.823280548971349e+02 2.956503281023744e+02 2.231828795335844e+02 + 3 4.791948192186352e+02 -3.228825926298714e+02 2.575611801233854e+02 -2.429747818931872e+02 + 4 5.550489733015892e+02 4.055453773273639e+01 -5.532115082257600e+02 1.979190235960288e+01 + ME 9.068666738309424e-05 + +Event 223 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.612164685986320e+02 -4.527922182271190e+01 -1.095260585492910e+01 1.543391792239739e+02 + 3 6.984218503485874e+02 -4.629950983513679e+02 2.605715575888555e+02 -4.533553609726804e+02 + 4 6.403616810527803e+02 5.082743201740798e+02 -2.496189517339264e+02 2.990161817487065e+02 + ME 4.123399220657122e-04 + +Event 224 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.663853414671973e+02 -1.350882138037310e+02 9.706071747767020e+01 3.804401292344737e+00 + 3 6.436745581417565e+02 -4.469273298203082e+02 -4.412749113764767e+02 -1.408877256838113e+02 + 4 6.899401003910461e+02 5.820155436240392e+02 3.442141938988061e+02 
1.370833243914661e+02 + ME 3.471159351394509e-04 + +Event 225 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.702356777533546e+02 6.117158080352369e+02 -2.649249521350114e+02 -6.952987609335720e+01 + 3 6.901224376513153e+02 -6.564819557015361e+02 1.560869289536551e+02 1.446972404640001e+02 + 4 1.396418845953297e+02 4.476614766629927e+01 1.088380231813564e+02 -7.516736437064299e+01 + ME 6.416862637290038e-04 + +Event 226 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.307777643673113e+02 -4.569648094661606e+02 4.416236342013199e+02 -3.608155616351098e+02 + 3 1.446420186345138e+02 4.133161435221924e+01 -3.411742569426914e+01 1.343466131828505e+02 + 4 6.245802169981753e+02 4.156331951139413e+02 -4.075062085070508e+02 2.264689484522593e+02 + ME 4.842581063143803e-04 + +Event 227 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.408615397889291e+02 -4.398089081634772e+02 -5.325812259979131e+02 2.679574278743412e+02 + 3 4.035753807128125e+02 3.000971513323747e+02 2.468113220276344e+02 -1.090823496201683e+02 + 4 3.555630794982586e+02 1.397117568311025e+02 2.857699039702786e+02 -1.588750782541728e+02 + ME 3.220421479671745e-04 + +Event 228 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.775455372723294e+02 -3.656199842755111e+02 -6.289501053880601e+01 4.426342647953073e+02 + 3 3.247306314578497e+02 8.776645762339837e+01 3.116872137482897e+02 2.445634292125525e+01 + 4 
5.977238312698206e+02 2.778535266521127e+02 -2.487922032094836e+02 -4.670906077165625e+02 + ME 3.172625430011347e-03 + +Event 229 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.665477125629452e+02 -2.081014917770363e+02 2.317985113364040e+02 -1.931850016112187e+02 + 3 6.187040836990478e+02 -2.134593092471877e+02 -3.484367286517815e+02 4.645661552545953e+02 + 4 5.147482037380066e+02 4.215608010242241e+02 1.166382173153775e+02 -2.713811536433765e+02 + ME 4.407225536120210e-04 + +Event 230 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.913978529013565e+02 -4.986092821675884e+02 -3.028328044703766e+02 9.712104143419771e+01 + 3 3.439186614041001e+02 -6.573524045766425e+01 3.216488491089061e+02 -1.024741025375549e+02 + 4 5.646834856945436e+02 5.643445226252528e+02 -1.881604463852933e+01 5.353061103357446e+00 + ME 1.070100355610459e-04 + +Event 231 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.760768557894827e+02 -7.075794524290799e+01 5.609870884449791e+02 1.102331327656218e+02 + 3 6.038619762337338e+02 -2.467027894308989e+02 -5.464177649873398e+02 -7.221250677108812e+01 + 4 3.200611679767834e+02 3.174607346738069e+02 -1.456932345763944e+01 -3.802062599453370e+01 + ME 8.766665364889563e-05 + +Event 232 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.230187249684843e+02 -2.426041066061352e+02 1.884455685697195e+02 -6.545132479937492e+02 + 3 4.821326920133731e+02 
2.438648429837413e+02 -1.563760752388982e+01 4.156168142598493e+02 + 4 2.948485830181424e+02 -1.260736377606032e+00 -1.728079610458298e+02 2.388964337338999e+02 + ME 4.558440708943279e-05 + +Event 233 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.540260977608100e+02 -1.904526694678991e+02 -1.042089619355360e+02 -2.796475475319170e+02 + 3 4.925592302096041e+02 1.195034224421750e+02 3.554637678715695e+02 -3.193415679485398e+02 + 4 6.534146720295859e+02 7.094924702572415e+01 -2.512548059360335e+02 5.989891154804570e+02 + ME 2.503605706471657e-04 + +Event 234 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.866526101194274e+02 7.776953530733711e+01 -1.047503781897389e+01 1.693557493124072e+02 + 3 6.012752698516813e+02 5.974840035795013e+02 -4.570329760029637e+01 4.955829083294179e+01 + 4 7.120721200288896e+02 -6.752535388868375e+02 5.617833541927042e+01 -2.189140401453492e+02 + ME 2.174450659280371e-03 + +Event 235 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.032945404607943e+02 1.612889276925247e+02 2.561838854094329e+02 -4.020710050699557e+02 + 3 7.153634726767364e+02 -3.739069589148945e+02 -1.979140468542056e+02 5.768609140624164e+02 + 4 2.813419868624689e+02 2.126180312223699e+02 -5.826983855522716e+01 -1.747899089924608e+02 + ME 8.197713676081942e-04 + +Event 236 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.980797829886611e+02 -9.803971882836205e+00 
4.740144261428888e+02 5.123764137440797e+02 + 3 5.519387921056283e+02 -1.638876688381594e+02 -3.209728652821290e+02 -4.180355032606608e+02 + 4 2.499814249057108e+02 1.736916407209956e+02 -1.530415608607599e+02 -9.434091048341890e+01 + ME 2.817086843651441e-04 + +Event 237 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 1.604490925133744e+02 6.212857081252701e+01 9.075394990141039e+01 1.168232534834160e+02 + 3 6.578242662283154e+02 5.348507070161563e+02 -3.810396531957999e+02 3.842224792439631e+01 + 4 6.817266412583111e+02 -5.969792778286831e+02 2.902857032943891e+02 -1.552455014078122e+02 + ME 8.278818464359843e-04 + +Event 238 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.789018340499541e+02 1.069933592962544e+02 -2.572713415352737e+02 1.225197647611559e+01 + 3 4.761759619803054e+02 7.755191627191857e+01 -4.591043622469822e+02 -9.976187456245110e+01 + 4 7.449222039697414e+02 -1.845452755681728e+02 7.163757037822558e+02 8.750989808633528e+01 + ME 4.150558250915721e-02 + +Event 239 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.581461811054764e+02 -3.899520773556200e+02 2.006122777919944e+02 1.326273524830990e+02 + 3 3.013476461129690e+02 -2.996604136348060e+02 3.145663680794621e+01 4.951799549362093e+00 + 4 7.405061727815548e+02 6.896124909904260e+02 -2.320689145999406e+02 -1.375791520324611e+02 + ME 1.362899388823913e-02 + +Event 240 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 
-7.500000000000000e+02 + 2 5.932490652975306e+02 -4.094504138983959e+01 -3.300190662632462e+02 4.912793227530680e+02 + 3 3.147487537014151e+02 3.081803657249564e+02 4.097350029662017e+01 -4.912038692507524e+01 + 4 5.920021810010545e+02 -2.672353243351168e+02 2.890455659666261e+02 -4.421589358279928e+02 + ME 2.311468423068974e-03 + +Event 241 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.438703186026562e+01 1.425431959717181e+01 -4.430288595443105e+00 -4.180186016371769e+01 + 3 7.139617398095608e+02 -8.415544716076501e+01 -5.657765076565166e+02 -4.272659242311072e+02 + 4 7.416512283301738e+02 6.990112756359289e+01 5.702067962519594e+02 4.690677843948251e+02 + ME 9.788135494953836e-03 + +Event 242 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.798759956195423e+02 -1.259218082844715e+02 -3.429343473884153e+02 1.041417477651927e+02 + 3 6.208895880511434e+02 5.354328139337264e+02 1.248673426784089e+02 -2.884852319370315e+02 + 4 4.992344163293142e+02 -4.095110056492549e+02 2.180670047100064e+02 1.843434841718388e+02 + ME 4.534836827657824e-05 + +Event 243 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 2.320641800899439e+02 1.658639294991472e+02 7.783463994856530e+01 1.424243988788333e+02 + 3 6.251485586341130e+02 -2.328139095298015e+02 -4.262931976140131e+02 3.935511574875349e+02 + 4 6.427872612759425e+02 6.694998003065480e+01 3.484585576654475e+02 -5.359755563663683e+02 + ME 1.075142889772466e-02 + +Event 244 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 
7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.609991843787813e+02 -2.293678857540617e+02 -4.971623496474938e+02 -3.703240376037023e+02 + 3 1.091403980947071e+02 1.154537470975927e+01 -9.115666825632124e+00 -1.081445118228680e+02 + 4 7.298604175265122e+02 2.178225110443025e+02 5.062780164731259e+02 4.784685494265703e+02 + ME 2.140827479361126e-03 + +Event 245 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.893629130846660e+02 -3.546974954177181e+02 3.112856868655738e+02 -1.294873298810981e+02 + 3 7.129026631852472e+02 5.703735458058532e+02 -4.257115617679145e+02 -4.091322034012453e+01 + 4 2.977344237300872e+02 -2.156760503881351e+02 1.144258749023407e+02 1.704005502212232e+02 + ME 2.551465590883778e-05 + +Event 246 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 3.999457395350199e+02 9.605025124341066e+01 9.072234098128426e+01 3.774922524438974e+02 + 3 3.675469088581874e+02 -1.615841482674672e+01 2.570183669846762e+02 2.622426259669195e+02 + 4 7.325073516067926e+02 -7.989183641666395e+01 -3.477407079659604e+02 -6.397348784108170e+02 + ME 1.315080017103629e-01 + +Event 247 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.711864521923226e+02 3.763073240556692e+02 5.338170415278108e+02 1.546719678644905e+02 + 3 5.231557804938882e+02 -1.057595517177888e+02 -5.121603131388773e+02 -1.409615302513522e+01 + 4 3.056577673137891e+02 -2.705477723378804e+02 -2.165672838893370e+01 -1.405758148393554e+02 + ME 2.878571226351073e-04 + +Event 248 Batch 1 + 0 7.500000000000000e+02 
0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 6.307803946875937e+02 -6.240065811552295e+01 -3.654556314590156e+02 5.103256270499047e+02 + 3 3.935347424219227e+02 -2.188782290807617e+02 2.916853933646317e+01 -3.257470040392325e+02 + 4 4.756848628904837e+02 2.812788871962846e+02 3.362870921225527e+02 -1.845786230106721e+02 + ME 2.425945648815106e-04 + +Event 249 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 4.326970760901860e+02 -4.070406664121579e+02 -1.467447404863359e+02 3.261392852829556e+00 + 3 4.839435229991530e+02 2.335311811831336e+01 2.018595963184923e+02 -4.392136936630268e+02 + 4 5.833594009106612e+02 3.836875482938445e+02 -5.511485583215643e+01 4.359523008101971e+02 + ME 8.363791387914296e-05 + +Event 250 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.010671671345862e+02 -6.122994886156979e+02 -2.473946684860859e+02 2.353303785738851e+02 + 3 5.574643785654461e+02 3.902114201641946e+02 2.260985614407800e+02 -3.276904354069721e+02 + 4 2.414684542999682e+02 2.220880684515034e+02 2.129610704530567e+01 9.236005683308701e+01 + ME 4.707663105054272e-05 + +Event 251 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.364006127103795e+02 5.379960890463808e+02 4.302640987755425e+02 2.602285070392759e+02 + 3 3.051282143252570e+01 -2.901685968644104e+00 1.337962970917706e+01 -2.726899336532026e+01 + 4 7.330865658570955e+02 -5.350944030777370e+02 -4.436437284847198e+02 -2.329595136739561e+02 + ME 
8.478379327521634e-03 + +Event 252 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.965625584838614e+02 -7.369842915522102e+01 -5.671364104158781e+02 -1.697401534860145e+02 + 3 6.549338760881152e+02 -1.514014639568436e+02 6.313240788068731e+02 8.628954906696531e+01 + 4 2.485035654280237e+02 2.250998931120648e+02 -6.418766839099476e+01 8.345060441904933e+01 + ME 3.989409705418268e-05 + +Event 253 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.728678540484714e+02 3.212236187283236e+01 -4.622666283104808e+02 -3.368312580807653e+02 + 3 7.160302400837320e+02 1.132435775281999e+02 5.206369974620781e+02 4.783433011307397e+02 + 4 2.111019058677967e+02 -1.453659394010323e+02 -5.837036915159722e+01 -1.415120430499744e+02 + ME 1.249883411035568e-03 + +Event 254 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 5.579357369440609e+02 1.333150067790222e+02 -6.785864805882140e+01 5.375077668373273e+02 + 3 6.202682598689536e+02 -4.039338689731095e+02 2.012068793592835e+02 -4.255419314189537e+02 + 4 3.217960031869853e+02 2.706188621940872e+02 -1.333482313004621e+02 -1.119658354183736e+02 + ME 6.064955183459296e-04 + +Event 255 Batch 1 + 0 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 7.500000000000000e+02 + 1 7.500000000000000e+02 0.000000000000000e+00 0.000000000000000e+00 -7.500000000000000e+02 + 2 7.263612771087841e+02 3.396063850675521e+02 -6.401091575508393e+02 5.028393902637346e+01 + 3 1.540578578981474e+02 -3.080387127739227e+01 1.060177193258910e+02 -1.074485378375538e+02 + 4 6.195808649930683e+02 
-3.088025137901597e+02 5.340914382249483e+02 5.716459881118024e+01 + ME 1.551641178307155e-04 + From 6b667870ba0b2c86c4eec6e16f56e950390b6b52 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 14 Mar 2024 16:36:50 +0100 Subject: [PATCH 96/96] [susy2] in CODEGEN model_handling.py, add comments in write_hardcoded_parameters (fix my wrong comment about std::complex string replace - thanks to Olivier for pointing this out) --- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 5f407c35fc..80bb511c73 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -720,8 +720,10 @@ def write_hardcoded_parameters(self, params, deviceparams=set()): assert parset == '', "pardef is empty but parset is not: '%s'"%parset # AV sanity check (both are empty) res = '// (none)\n' return res + #=== Replace patterns in pardef (left of the assignment '=') pardef = pardef.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) - parset = parset.replace('std::complex<','cxsmpl<') # NB this is really needed twice! 
(if there are two std::complex on the same line) + #=== Replace patterns in parset (right of the assignment '=') + parset = parset.replace('std::complex<','cxsmpl<') # custom simplex complex class (with constexpr arithmetics) parset = parset.replace('sqrt(','constexpr_sqrt(') # constexpr sqrt (based on iterative Newton-Raphson approximation) parset = parset.replace('pow(','constexpr_pow(') # constexpr pow parset = parset.replace('atan(','constexpr_atan(') # constexpr atan for BSM #627 @@ -739,6 +741,7 @@ def write_hardcoded_parameters(self, params, deviceparams=set()): parset = parset.replace('*',' * ') parset = parset.replace('/',' / ') parset = parset.replace(',',', ') + #=== Compute pardef_lines from pardef (left of the assignment '=') pardef_lines = {} for line in pardef.split('\n'): ###print(line) # for debugging @@ -754,6 +757,7 @@ def write_hardcoded_parameters(self, params, deviceparams=set()): ###misc.sprint( 'pardef_lines size =', len(pardef_lines), ', keys size =', len(pardef_lines.keys()) ) ###print( pardef_lines ) # for debugging ###for line in pardef_lines: misc.sprint(line) # for debugging + #=== Compute parset_lines from parset (right of the assignment '=') parset_pars = [] parset_lines = {} skipnextline = False @@ -771,6 +775,7 @@ def write_hardcoded_parameters(self, params, deviceparams=set()): ###misc.sprint( 'parset_lines size =', len(parset_lines), ', keys size =', len(parset_lines.keys()) ) ###print( parset_lines ) # for debugging ###for line in parset_lines: misc.sprint(line) # for debugging + #=== Assemble pardef_lines and parset_lines into a single string res and then replace patterns in res assert len(pardef_lines) == len(parset_lines), 'len(pardef_lines) != len(parset_lines)' # AV sanity check (same number of parameters) res = ' '.join( pardef_lines[par] + ' = ' + parset_lines[par] + '\n' for par in parset_pars ) # no leading ' ' on first row res = res.replace(' ;',';')